# ---- Libraries ----
# FIX: library(tidyverse) was loaded twice; loaded once here.
# tidyverse attaches dplyr, tidyr, ggplot2, stringr; they are kept below as
# explicit attachments for clarity (re-attaching is a harmless no-op).
library(tidyverse)
library(tm)         # text-mining: Corpus, tm_map, DocumentTermMatrix
library(tidytext)
library(dplyr)
library(wordcloud)
library(tidyr); library(ggplot2); library(ggthemes)
library(grid)
library(gridExtra)  # grid.arrange for side-by-side plots
library(magrittr)
library(stringr)

# ---- Data ----
# FIX: use TRUE/FALSE, never T/F (T and F are reassignable).
# stringsAsFactors = FALSE keeps Review_Text as character, which tm requires.
disneyland <- read.csv(file = "disneyland.csv", stringsAsFactors = FALSE)
rides <- read_csv('rides.csv')

Research Question No.2

Hypothesis 1:

# grid and gridExtra are already attached at the top of the script; these
# calls are redundant but harmless (library() is a no-op for loaded packages).
library(grid)
library(gridExtra)

Ho: Reviews from different Disney branches tend to focus on similar topics.
Ha: Reviews from different Disney branches focus on different topics.

# Partition the reviews into one data frame per Disneyland branch.
# dplyr::filter() resets row names, matching the original behaviour.
hongkong <- filter(disneyland, Branch == 'Disneyland_HongKong')
cali <- filter(disneyland, Branch == 'Disneyland_California')
paris <- filter(disneyland, Branch == 'Disneyland_Paris')

Split the dataset into the three branches based on location; the reviews for each branch are examined below.

Hong Kong

# Collapse "Hong Kong" into a single token so it survives tokenization.
# fixed = TRUE requests a literal match; the pattern contains no regex
# metacharacters, so the result is identical to the original regex call.
hongkong$Review_Text <- gsub(pattern = 'Hong Kong', replacement = 'HongKong',
                             x = hongkong$Review_Text, fixed = TRUE)

# Build the Hong Kong corpus: one tm document per review.
corpus <- Corpus(VectorSource(hongkong$Review_Text))
# Inspect the first document's raw text.
corpus[[1]][1]
## $content
## [1] "If you've ever been to Disneyland anywhere you'll find Disneyland HongKong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides  its a Small World  is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well. "
# --- Text cleaning (order matters: lowercase first so later matching works) ---
# Lowercase all text.
corpus <- tm_map(corpus, FUN = content_transformer(tolower))

# Strip URLs before punctuation removal would fuse them into surrounding text.
corpus <- tm_map(corpus,
                 FUN = content_transformer(FUN = function(x) {
                   gsub(pattern = 'http[[:alnum:][:punct:]]*',
                        replacement = ' ', x = x)
                 }))

# Drop punctuation characters.
corpus <- tm_map(corpus, FUN = removePunctuation)
corpus[[1]][1]
## $content
## [1] "if youve ever been to disneyland anywhere youll find disneyland hongkong very similar in the layout when you walk into main street it has a very familiar feel one of the rides  its a small world  is absolutely fabulous and worth doing the day we visited was fairly hot and relatively busy but the queues moved fairly well "
# Remove common English stopwords ("the", "and", ...).
corpus <- tm_map(corpus, FUN = removeWords, c(stopwords('english')))
corpus[[1]][1]
## $content
## [1] " youve ever   disneyland anywhere youll find disneyland hongkong  similar   layout   walk  main street     familiar feel one   rides    small world   absolutely fabulous  worth   day  visited  fairly hot  relatively busy   queues moved fairly well "
# Collapse runs of whitespace left behind by the removals above.
corpus <- tm_map(corpus, FUN = stripWhitespace)
corpus[[1]][1]
## $content
## [1] " youve ever disneyland anywhere youll find disneyland hongkong similar layout walk main street familiar feel one rides small world absolutely fabulous worth day visited fairly hot relatively busy queues moved fairly well "
# Completion dictionary built from the FULL dataset (all branches); lowfreq = 0
# keeps every term. Used below by stemCompletion to map stems back to words.
dict <- findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(disneyland$Review_Text))),
                      lowfreq = 0)
dict_corpus <- Corpus(VectorSource(dict))

# Reduce words to their stems (e.g. "parades" -> "parad").
corpus <- tm_map(corpus, FUN = stemDocument)

# Document-term matrix over the cleaned, stemmed reviews.
dtm <- DocumentTermMatrix(corpus)

# Keep only terms appearing in at least 3% of documents.
xdtm <- removeSparseTerms(dtm, sparse = 0.97)
xdtm
## <<DocumentTermMatrix (documents: 9147, terms: 292)>>
## Non-/sparse entries: 234093/2436831
## Sparsity           : 91%
## Maximal term length: 10
## Weighting          : term frequency (tf)
dim(xdtm)
## [1] 9147  292
# Densify, complete each stem back to the most prevalent full word, and make
# the resulting column names syntactically valid.
xdtm <- as.data.frame(as.matrix(xdtm))
colnames(xdtm) <- stemCompletion(x = colnames(xdtm),
                                 dictionary = dict_corpus,
                                 type = 'prevalent')
colnames(xdtm) <- make.names(colnames(xdtm))
# Per-word total frequency across all Hong Kong reviews.
word_freq_hk <- data.frame(word = colnames(xdtm), freq = colSums(xdtm))


# Word cloud (kept commented out; uncomment to render).
# set.seed(617)
# wordcloud(words = word_freq_hk$word,word_freq_hk$freq,scale=c(2,0.5),max.words = 100,colors=brewer.pal(9,"Spectral"))

# Total token count for Hong Kong; each word's frequency is expressed as a
# percentage of this total. head() (not slice) preserves the word row names,
# which the topic selection below relies on.
word_total_hk <- sum(word_freq_hk$freq)
hk_30 <- word_freq_hk %>%
  mutate(percentage = freq / word_total_hk * 100) %>%
  arrange(desc(percentage)) %>%
  head(30)

# top 30 frequent words in Hong Kong
hk_30
##                  word freq percentage
## ride             ride 9138  3.0537055
## park             park 8088  2.7028201
## disneyland disneyland 7616  2.5450888
## day               day 7045  2.3542740
## disney         disney 6953  2.3235297
## time             time 5279  1.7641181
## visit           visit 4536  1.5158249
## one               one 3914  1.3079671
## kid               kid 3877  1.2956026
## show             show 3413  1.1405446
## get               get 3408  1.1388738
## place           place 3231  1.0797245
## hongkong     hongkong 3070  1.0259221
## food             food 2999  1.0021955
## can               can 2894  0.9671070
## enjoy           enjoy 2889  0.9654361
## parad           parad 2889  0.9654361
## great           great 2878  0.9617602
## good             good 2748  0.9183172
## will             will 2473  0.8264187
## just             just 2447  0.8177301
## like             like 2435  0.8137200
## queue           queue 2286  0.7639276
## attract       attract 2257  0.7542365
## small           small 2236  0.7472188
## fun               fun 2220  0.7418720
## firework     firework 2215  0.7402011
## realli         realli 2213  0.7395328
## wait             wait 2178  0.7278366
## love             love 2172  0.7258315
#hk_30%>%
#  ggplot(aes(x=reorder(word,percentage), y=percentage, fill=percentage))+
#  geom_col(position='dodge')+
#  coord_flip()+
#  scale_fill_gradient2(low ="#FFEDF0", high = "#C84747",space ="Lab", guide = FALSE) + 
#  theme_economist()+
#  theme(plot.background = element_blank()) 

The above result shows the top 30 most frequent words in the Hong Kong branch. There are words that are very common in this context but do not convey useful information, like ‘disneyland’, ‘disney’, ‘one’, and verbs like ‘get’, ‘can’, ‘will’, etc. Therefore, by manually selecting words that convey more meaningful information, we can identify what topics reviewers care about when they visit each branch.

Selected top 10 topics in Hong Kong
# Keep only the ten hand-picked, information-bearing topics for Hong Kong
# (hk_30 carries the words as row names).
picked_hk <- c("ride", "park", "kid", "show", "hongkong",
               "food", "parad", "queue", "firework", "wait") # 10 words
hk_topwords <- hk_30[rownames(hk_30) %in% picked_hk, ]

# Horizontal bar chart of the selected topics, shaded by frequency share.
hk_topwords %>%
  ggplot(aes(x = reorder(word, percentage), y = percentage, fill = percentage)) +
  geom_col(position = 'dodge') +
  coord_flip() +
  scale_fill_gradient2(low = "#FFEDF0", high = "#C84747", space = "Lab", guide = FALSE) +
  theme_economist() +
  theme(plot.background = element_blank()) +
  labs(x = "Words", y = "Frequency in %") +
  ggtitle("Top 10 frequent topics in Hong Kong with % distribution")

Above are the selected top 10 topics with frequency percentages for the Hong Kong branch. We notice that a considerable proportion of reviewers care about the rides, the park itself, their kids, the shows, etc. People appear to be excited about the Hong Kong branch itself. These are followed by food, the parade, queueing and waiting, as well as the fireworks. The top topics generally cover most of the facilities included in a typical Disneyland park.
Next, we would use the same process to first generate top 30 frequent words for California branch and Paris branch, and then manually select meaningful topics that can provide insights.

California

# ---- California: same cleaning pipeline as the Hong Kong section ----
corpus <- Corpus(VectorSource(cali$Review_Text))
corpus[[1]][1]

# Lowercase all text.
corpus <- tm_map(corpus, FUN = content_transformer(tolower))

# Strip URLs before punctuation removal.
corpus <- tm_map(corpus,
                 FUN = content_transformer(FUN = function(x) {
                   gsub(pattern = 'http[[:alnum:][:punct:]]*',
                        replacement = ' ', x = x)
                 }))

# Drop punctuation.
corpus <- tm_map(corpus, FUN = removePunctuation)
corpus[[1]][1]

# Remove English stopwords.
corpus <- tm_map(corpus, FUN = removeWords, c(stopwords('english')))
corpus[[1]][1]

# Collapse extra whitespace.
corpus <- tm_map(corpus, FUN = stripWhitespace)
corpus[[1]][1]

# Completion dictionary from the FULL dataset (all branches), as in the
# Hong Kong section.
dict <- findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(disneyland$Review_Text))),
                      lowfreq = 0)
dict_corpus <- Corpus(VectorSource(dict))

# Stem the cleaned reviews.
corpus <- tm_map(corpus, FUN = stemDocument)

# Document-term matrix, then drop terms absent from 97%+ of reviews.
dtm <- DocumentTermMatrix(corpus)
xdtm <- removeSparseTerms(dtm, sparse = 0.97)
xdtm
dim(xdtm)

# Densify, complete stems, and fix up column names.
xdtm <- as.data.frame(as.matrix(xdtm))
colnames(xdtm) <- stemCompletion(x = colnames(xdtm),
                                 dictionary = dict_corpus,
                                 type = 'prevalent')
colnames(xdtm) <- make.names(colnames(xdtm))
# Per-word totals for California.
word_freq_cali <- data.frame(word = colnames(xdtm), freq = colSums(xdtm))

# Word cloud (kept commented out; uncomment to render).
# set.seed(617)
# wordcloud(words = word_freq_cali$word,word_freq_cali$freq,scale=c(2,0.5),max.words = 100,colors=brewer.pal(9,"Spectral"))

# Each word's share of all retained California tokens; keep the top 30
# (head() preserves the word row names used by the topic selection below).
word_total_ca <- sum(word_freq_cali$freq)
ca_30 <- word_freq_cali %>%
  mutate(percentage = freq / word_total_ca * 100) %>%
  arrange(desc(percentage)) %>%
  head(30)
# top 30 frequent topics in California
ca_30
##                  word  freq percentage
## park             park 21691  3.1450362
## ride             ride 20850  3.0230973
## time             time 16646  2.4135481
## disneyland disneyland 15329  2.2225928
## day               day 14834  2.1508214
## disney         disney 12237  1.7742754
## get               get 11930  1.7297626
## line             line  9061  1.3137787
## pass             pass  8027  1.1638562
## one               one  7739  1.1220983
## wait             wait  7580  1.0990445
## can               can  7244  1.0503270
## visit           visit  7217  1.0464122
## great           great  7024  1.0184286
## place           place  6596  0.9563717
## love             love  6263  0.9080891
## just             just  6233  0.9037394
## year             year  5859  0.8495121
## kid               kid  5837  0.8463223
## crowd           crowd  5738  0.8319680
## like             like  5303  0.7688962
## will             will  5227  0.7578767
## fast             fast  5201  0.7541069
## see               see  5073  0.7355479
## peopl           peopl  4628  0.6710261
## fun               fun  4603  0.6674013
## even             even  4481  0.6497122
## much             much  4469  0.6479723
## enjoy           enjoy  4399  0.6378228
## back             back  4301  0.6236135
#ca_30%>%
#  ggplot(aes(x=reorder(word, percentage), y=percentage, fill=percentage))+
#  geom_col(position='dodge')+
#  coord_flip()+
#  scale_fill_gradient2(low ="#D6F1DF", high = "#00A632",space ="Lab", guide = FALSE) + 
#  theme_economist()+
#  theme(plot.background = element_blank()) 
Selected top 10 topics in California
# Hand-picked, information-bearing topics for California.
picked_ca <- c("park", "ride", "line", "pass", "wait",
               "kid", "crowd", "fast", "peopl", "back") # 10 words
ca_topwords <- ca_30[rownames(ca_30) %in% picked_ca, ]

# Horizontal bar chart of the selected topics, shaded by frequency share.
ca_topwords %>%
  ggplot(aes(x = reorder(word, percentage), y = percentage, fill = percentage)) +
  geom_col(position = 'dodge') +
  coord_flip() +
  scale_fill_gradient2(low = "#D6F1DF", high = "#00A632", space = "Lab", guide = FALSE) +
  theme_economist() +
  theme(plot.background = element_blank()) +
  labs(x = "Words", y = "Frequency in %") +
  ggtitle("Top 10 frequent topics in California with % distribution")

Above are the top 10 most frequent topics in the California branch. Compared to the Hong Kong branch, we can easily see that words like ‘line’, ‘pass’, ‘crowd’, ‘fast’, and ‘people’ occurred on the list. This might suggest that people have more to say about their queue experiences, waiting times, the visiting crowd or visitor volume, and probably the speed and efficiency of taking rides during their visits.

Paris

# ---- Paris: same cleaning pipeline as the other two branches ----
corpus <- Corpus(VectorSource(paris$Review_Text))
corpus[[1]][1]

# Lowercase all text.
corpus <- tm_map(corpus, FUN = content_transformer(tolower))

# Strip URLs before punctuation removal.
corpus <- tm_map(corpus,
                 FUN = content_transformer(FUN = function(x) {
                   gsub(pattern = 'http[[:alnum:][:punct:]]*',
                        replacement = ' ', x = x)
                 }))

# Drop punctuation.
corpus <- tm_map(corpus, FUN = removePunctuation)
corpus[[1]][1]

# Remove English stopwords.
corpus <- tm_map(corpus, FUN = removeWords, c(stopwords('english')))
corpus[[1]][1]

# Collapse extra whitespace.
corpus <- tm_map(corpus, FUN = stripWhitespace)
corpus[[1]][1]

# NOTE(review): unlike the Hong Kong and California sections (which build the
# completion dictionary from the full dataset), this dictionary uses Paris
# reviews only — likely the source of odd completions such as "pariah." and
# "characted" in the printed pr_30 output. Confirm whether this is intended.
dict <- findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(paris$Review_Text))),
                      lowfreq = 0)
dict_corpus <- Corpus(VectorSource(dict))

# Stem the cleaned reviews.
corpus <- tm_map(corpus, FUN = stemDocument)

# Document-term matrix, then drop terms absent from 97%+ of reviews.
dtm <- DocumentTermMatrix(corpus)
xdtm <- removeSparseTerms(dtm, sparse = 0.97)
xdtm
dim(xdtm)

# Densify, complete stems, and fix up column names.
xdtm <- as.data.frame(as.matrix(xdtm))
colnames(xdtm) <- stemCompletion(x = colnames(xdtm),
                                 dictionary = dict_corpus,
                                 type = 'prevalent')
colnames(xdtm) <- make.names(colnames(xdtm))
# Per-word totals for Paris.
word_freq_paris <- data.frame(word = colnames(xdtm), freq = colSums(xdtm))

# Word cloud (kept commented out; uncomment to render).
#set.seed(617)
#wordcloud(words = word_freq_paris$word,word_freq_paris$freq,scale=c(2,0.5),max.words = 100,colors=brewer.pal(9,"Spectral"))

# Total token count for the Paris branch.
word_total_pr = sum(word_freq_paris$freq)
# Top 30 words as a share of *Paris* tokens.
# BUG FIX: the original divided by word_total_ca (the California total),
# mis-scaling every Paris percentage; divide by the Paris total instead.
# (The printed pr_30 output below still reflects the old, incorrect scaling.)
pr_30 <- word_freq_paris %>%
  mutate(percentage = freq / word_total_pr * 100) %>%
  arrange(desc(percentage)) %>%
  head(30)
# top 30 frequent topics in Paris
pr_30
##          word  freq percentage
## 1        park 21527  3.1212574
## 2        ride 18419  2.6706201
## 3      disney 14422  2.0910844
## 4        time 13378  1.9397120
## 5         day 12148  1.7613711
## 6         get  9485  1.3752555
## 7         one  8172  1.1848802
## 8       queue  8167  1.1841552
## 9  disneyland  7417  1.0754107
## 10      visit  6822  0.9891400
## 11       just  6351  0.9208485
## 12        can  5997  0.8695211
## 13       food  5931  0.8599516
## 14       wait  5785  0.8387826
## 15    pariah.  5479  0.7944149
## 16       good  5455  0.7909351
## 17      great  5174  0.7501921
## 18       show  5060  0.7336630
## 19        see  4953  0.7181487
## 20       love  4861  0.7048094
## 21        kid  4818  0.6985747
## 22   reallity  4788  0.6942249
## 23      place  4766  0.6910351
## 24       year  4733  0.6862503
## 25      magic  4646  0.6736360
## 26       will  4532  0.6571068
## 27      parad  4524  0.6559469
## 28  characted  4491  0.6511621
## 29       hour  4471  0.6482623
## 30       like  4420  0.6408676
#pr_30%>%
#  ggplot(aes(x=reorder(word, percentage), y=percentage, fill=percentage))+
#  geom_col(position='dodge')+
#  coord_flip()+
#  scale_fill_gradient2(low ="#D4DFFF", high = "#4169E2",space ="Lab", guide = FALSE) + 
#  theme_economist()+
#  theme(plot.background = element_blank())
Selected top 10 topics in Paris
# Hand-picked topic rows for Paris. pr_30 carries sequential row names
# ("1".."30"), so the numbers here select by rank position (coerced to
# character by %in%).
picked_rows <- c(1, 2, 8, 13, 14, 15, 18, 21, 27, 28) # 10 words
pr_topwords <- pr_30[rownames(pr_30) %in% picked_rows, ]

# Horizontal bar chart of the selected topics, shaded by frequency share.
pr_topwords %>%
  ggplot(aes(x = reorder(word, percentage), y = percentage, fill = percentage)) +
  geom_col(position = 'dodge') +
  coord_flip() +
  scale_fill_gradient2(low = "#D4DFFF", high = "#4169E2", space = "Lab", guide = FALSE) +
  theme_economist() +
  theme(plot.background = element_blank()) +
  labs(x = "Words", y = "Frequency in %") +
  ggtitle("Top 10 frequent topics in Paris with % distribution")

Above are the top 10 most frequent topics in the Paris branch. As in the previous two branches, reviewers frequently mentioned ‘queue’, ‘food’, ‘wait’, ‘show’, ‘parade’, and ‘kid’. What is worth noticing, however, is that they also mentioned ‘pariah’ and ‘character’ a lot — words that did not appear among the top topics of the other two branches.

# Show two sample reviews mentioning "pariah" (case-insensitive) for context.
pariah_hits <- grep("pariah", paris$Review_Text, ignore.case = TRUE)
head(paris[pariah_hits, "Review_Text"], 2)
## [1] "Currently on holiday at Eurodisney. Have been to Disney in Florida numerous times, although extremely busy in high season the USA staff are courteous, friendly and efficient. Eurodisney is pleasant and spotless. But Oh! The service! There are practically no guides or attendants to assist you. And don't even think of trying to eat between noon and 3pm. We stood in line for fast food fare for one hour, we then stood at the counter for another fifteen minutes, we then fought for a table for a further fifteen minutes! I know that a Disney visit entails a lot of waiting in line but having to spend 90 minutes to get a fast food burger and fries is unacceptable. On informing the staff of the problem they shrugged their shoulders and basically did not care. Take a tip from me: Pay the little extra and go to the real Disney where the staff will treat you like a guest, not a pariah!"                                                                                                                                                                                                                                                                                                                                                                                                                                                               
## [2] "It was a living hell.I took 2 trained carers and a severely mentally handicapped adult. It was a special once in a lifetime treat so stayed in the very expensive Disneyland Hotel. Took in doctors letter detailing ALL his medical problems. The staff in City Hall gave us a pass and explained how to use it., they were very pleasant.Sadly Disney have not told any of their ride operators about it and we were treated like S*$T.The 2 carers stated they would NEVER return and will advise all other parents NOT to consider it as they had never been anywhere so UNFRIENDLY to the disabled.Most of the disabled exits marked for your use are UNMANNED which ties in with the Disney approach of   Who cares, we already have yer cash   One of my carers assisted a family group who were in the wrong queue and would have stood there forever ignored by park staff and took them to where they could actually access a ride.Complete SO WHAT attitude by all ride staff. Told to go to the back of the queue , what was my problem ??? Must have inhaled around 100 cigs by passive smoking as its EVERYWHERE. Food is rubbish and overpriced.We hated it, we were made to feel like pariahs, my son was signing  HOME  by Sunday afternoon and was desparate to leave..........and this is someone who has been obsessed by Disney videos etc all his life. AWFUL."
# Show two sample reviews mentioning "character" (case-insensitive).
character_hits <- grep("character", paris$Review_Text, ignore.case = TRUE)
head(paris[character_hits, "Review_Text"], 2)
## [1] "3rd time hereIt gets betterPlenty to do for allShopping Meet the Character'sRoller coasters and finish off with the fireworks display at 10 beside the castleSpace Mountain and Rockin roller coaster highly recomended or if you have the nerve tower of terror"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [2] "Its a shame this park carry the name Disney... They build the place years back and they are now just collecting cash without any effort, it does not compare in anything with the American parks which are truly amazing. The whole park now is designed to get money from their guests on every step without offering any services. The staff are truly miserable and seems like they hate their job. In the whole park there were no characters walking around at all so a kid can take a picture. There were 3 characters who a kid could meet in the whole park but you had to wait forever just to take a picture. Many rides where closed and the wait time for anything was 60 minutes or more, who in the world can make a a kid stand still in a waiting line for 60 Minutes or more? Avoid at all costs especially if you have young children. PS. Parking is    30 to park 30 minutes walk from the entrance after spending    75 person for your ticket...just ridiculous "

Taking closer look at the reviews, we noticed that the reviewer who mentioned ‘pariah’ mostly talked about how they were treated like ‘pariah’ in some cases, some suggested otherwise. This might suggest that the customer services might be considered as polarized. For the ‘character’ word, this might suggest that visitors who went to Paris branch pay considerable attention to meeting with the characters in the park.

Same frequent topics in all three branches

# Topics common to all three branches, each slice tagged with its branch.
hk_sub <- filter(hk_topwords,
                 word %in% c('park', 'ride', 'wait', 'kid')) %>% mutate(branch = 'hk')

ca_sub <- filter(ca_topwords,
                 word %in% c('park', 'ride', 'wait', 'kid')) %>% mutate(branch = 'ca')

pr_sub <- filter(pr_topwords,
                 word %in% c('park', 'ride', 'wait', 'kid')) %>% mutate(branch = 'pr')

# Stack the three slices and keep only the plotting columns.
same_topwords <- rbind(hk_sub, ca_sub, pr_sub) %>%
  select(word, percentage, branch)

# Side-by-side comparison of the shared topics, one facet per branch.
ggplot(same_topwords, aes(x = word, y = percentage, fill = branch)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ branch, ncol = 3) +
  labs(x = "Words", y = "Percentages in %", fill = "Branch") +
  theme(plot.background = element_blank(),
        #text=element_text(size=16),
        strip.text = element_text(size = 14),
        axis.text.x = element_text(size = 14, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 14),
        legend.position = "none") +
  # BUG FIX: `guide = FALSE` was previously passed INSIDE the `values` vector,
  # where it silently became a bogus palette entry named "guide" rather than an
  # argument to scale_fill_manual(); it is now a proper scale argument (the
  # legend is already hidden by legend.position = "none" above).
  scale_fill_manual(values = c("hk" = "#EE4242", "ca" = "#28E92C", "pr" = "#4169E2"),
                    guide = "none") +
  ggtitle("Same frequent words/topics in all three branches")

These words, ‘kid’, ‘park’, ‘ride’, and ‘wait’ are topics that are mentioned for all three branches. These might suggest that reviewers care about these topics regardless of the branch locations.

Hypothesis 2:

Ho: Hong Kong Disneyland receives reviews related to shopping experiences in similar proportion as other branches.
Ha: Hong Kong Disneyland receives reviews related to shopping experiences in different proportion as other branches.

# examine branch names
# Sanity check: the three branch labels expected by the splits below.
unique(disneyland$Branch)
## [1] "Disneyland_HongKong"   "Disneyland_California" "Disneyland_Paris"

Split dataset based on branch

# Re-split reviews by branch for Hypothesis 2 (same partition as in RQ2 above,
# repeated so this section can run independently).
hongkong <- filter(disneyland, Branch == 'Disneyland_HongKong')
cali <- filter(disneyland, Branch == 'Disneyland_California')
paris <- filter(disneyland, Branch == 'Disneyland_Paris')

Check document integrity

# Every review must land in exactly one branch subset.
nrow(hongkong) + nrow(cali) + nrow(paris) == nrow(disneyland)
## [1] TRUE

Create corpus based on branches:

# One tm corpus per branch; each document is a single review's text.
corpus_hk <- Corpus(VectorSource(hongkong$Review_Text))
corpus_cali <- Corpus(VectorSource(cali$Review_Text))
corpus_paris <- Corpus(VectorSource(paris$Review_Text))

Hong Kong

Define shopping-related words:

# Hand-curated vocabulary of shopping-related terms to retain in the corpora.
shopping_words <- c(
  'shop', 'souvenir', 'merchandise', 'collectible', 'gift',
  'shirt', 'toy', 'book', 'hat', 'band',
  'art', 'mall', 'cashier', 'cloth', 'checkout',
  'cart', 'outlet', 'bag', 'quality', 'boutique'
)

Cleaning texts:

# --- Hong Kong: reduce each review to the shopping vocabulary only ---
corpus <- corpus_hk
# convert to lower cases
corpus = tm_map(corpus,FUN = content_transformer(tolower))
# remove punctuations
corpus = tm_map(corpus,FUN = removePunctuation)
# remove stopwords
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
# stripwhitespaces
corpus = tm_map(corpus,FUN = stripWhitespace)
# stem the corpus
corpus = tm_map(corpus,FUN = stemDocument)
# DTM over the cleaned reviews: its column names give the full vocabulary,
# from which we subtract the shopping terms to get the removal list.
dtm_hk <- DocumentTermMatrix(corpus)
words_remove <- setdiff(colnames(dtm_hk), shopping_words)
length(words_remove)
## [1] 17979
# Remove non-shopping terms in chunks (removeWords builds one big regex,
# which fails on very large vocabularies, hence the chunking).
# BUG FIX: the original hard-coded chunk bounds ending at 17987, past the
# 17979 available terms, so the final chunk passed NA entries to removeWords;
# the bounds are now derived from length(words_remove).
chunk_start <- seq(1, length(words_remove), by = 3000)
for (s in chunk_start) {
  e <- min(s + 2999, length(words_remove))
  corpus <- tm_map(corpus, FUN = removeWords, words_remove[s:e])
}
corpus = tm_map(corpus, FUN = removeNumbers)

Transform cleaned corpus to document term matrix

# Document-term matrix after cleaning: only shopping-related terms remain.
dtm_hk_then <- DocumentTermMatrix(corpus)
# Surviving vocabulary (16 of the 20 candidate words appear in HK reviews).
colnames(dtm_hk_then)
##  [1] "gift"     "shop"     "souvenir" "book"     "toy"      "cloth"   
##  [7] "hat"      "outlet"   "bag"      "mall"     "cart"     "cashier" 
## [13] "shirt"    "art"      "band"     "checkout"

Make data frame

# Densify to a data frame: one row per review, one column per shopping word.
dtm_hk_then = as.data.frame(as.matrix(dtm_hk_then))
# Total mentions of each shopping word across all Hong Kong reviews.
sort(colSums(dtm_hk_then),decreasing = T)
##     shop      toy     book souvenir      bag     gift   outlet     cart 
##      860      738      416      400      194      133      107       78 
##      hat    cloth    shirt      art     band     mall  cashier checkout 
##       76       68       48       24       19       18       16        1

Add information summarizing columns:

# Per-review total of shopping-word occurrences (computed before the new
# column is attached, so the sum covers only the 16 word columns).
dtm_hk_then$shopping_word_count <- rowSums(dtm_hk_then)
# 1/0 flag: does the review mention at least one shopping word?
dtm_hk_then$if_shopping_word <- ifelse(dtm_hk_then$shopping_word_count >= 1, 1, 0)

Attach word frequency to original data frame and select columns

# Attach the shopping-word counts to the review metadata and drop the text/ID
# columns that are not needed for the rating analysis.
hongkong <- hongkong %>%
  cbind(dtm_hk_then) %>%
  select(-Review_ID, -Reviewer_Location, 
         -Review_Text, -Year_Month, 
         -Year, -Month, -continent)
# Indicator for reviews mentioning no shopping word at all.
# NOTE(review): columns 4:19 are assumed to be the 16 shopping-word columns;
# this positional indexing breaks if the column layout changes — confirm.
hongkong['no_shopping_word'] = ifelse(rowSums(hongkong[4:19]) == 0, 1, 0)
# Drop all-zero columns (candidate shopping words never mentioned here).
hongkong <- hongkong %>%
  select_if(function(x) !all(x == 0))

Calculate average ratings for each shopping words and add counts

# Average Rating among reviews that mention each shopping word (columns 4:22
# also cover the three summary columns added above), plus mention counts.
word_cols <- names(colSums(hongkong[c(4:22)]))
shopping_hk = data.frame(rd = word_cols)  # kept for parity with the original helper frame
# FIX: the original grew a data frame with rbind() inside a nested loop and
# passed it through a misspelled argument (`avg_raing = df`, which only printed
# correctly because nested data.frame columns keep their inner names). The
# per-word mean rating is computed here in a single vectorized pass instead.
avg_ratings <- vapply(word_cols, function(w) {
  mean(hongkong$Rating[hongkong[[w]] != 0])
}, numeric(1))
shopping_mention_count <- data.frame(rides = word_cols,
                                     counts = as.numeric(colSums(hongkong[c(4:22)])),
                                     avg_rating = unname(avg_ratings))
shopping_mention_count
##                  rides counts avg_rating
## 1                 gift    133   4.237705
## 2                 shop    860   4.196906
## 3             souvenir    400   4.185792
## 4                 book    416   4.385445
## 5                  toy    738   4.297638
## 6                cloth     68   4.348485
## 7                  hat     76   4.027397
## 8               outlet    107   3.970588
## 9                  bag    194   4.075949
## 10                mall     18   4.000000
## 11                cart     78   4.144928
## 12             cashier     16   3.545455
## 13               shirt     48   4.066667
## 14                 art     24   4.523810
## 15                band     19   4.277778
## 16            checkout      1   4.000000
## 17 shopping_word_count   3196   4.241515
## 18    if_shopping_word   2033   4.241515
## 19    no_shopping_word   7114   4.208040

Get frequency of each shopping word in total branch reviews & in reviews that mentioned shopping in the branch

# Frequency of each shopping word as a % of all branch reviews
# (freq_branch_pct) and as a % of total shopping-word mentions
# (freq_Mention_pct), rounded to 2 decimals and sorted descending.
freq_all <- shopping_mention_count %>%
  mutate(freq_branch_pct = counts / nrow(hongkong) * 100,
         # ROBUSTNESS FIX: the original divided by counts[17], relying on
         # shopping_word_count sitting in row 17; look it up by name instead.
         freq_Mention_pct = counts / counts[rides == "shopping_word_count"] * 100) %>%
  # FIX: superseded mutate_at() replaced by across() (columns 3:5 are
  # avg_rating, freq_branch_pct, freq_Mention_pct).
  mutate(across(3:5, ~ round(.x, digits = 2))) %>%
  arrange(desc(freq_Mention_pct))
freq_all
##                  rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1     no_shopping_word   7114       4.21           77.77           222.59
## 2  shopping_word_count   3196       4.24           34.94           100.00
## 3     if_shopping_word   2033       4.24           22.23            63.61
## 4                 shop    860       4.20            9.40            26.91
## 5                  toy    738       4.30            8.07            23.09
## 6                 book    416       4.39            4.55            13.02
## 7             souvenir    400       4.19            4.37            12.52
## 8                  bag    194       4.08            2.12             6.07
## 9                 gift    133       4.24            1.45             4.16
## 10              outlet    107       3.97            1.17             3.35
## 11                cart     78       4.14            0.85             2.44
## 12                 hat     76       4.03            0.83             2.38
## 13               cloth     68       4.35            0.74             2.13
## 14               shirt     48       4.07            0.52             1.50
## 15                 art     24       4.52            0.26             0.75
## 16                band     19       4.28            0.21             0.59
## 17                mall     18       4.00            0.20             0.56
## 18             cashier     16       3.55            0.17             0.50
## 19            checkout      1       4.00            0.01             0.03
Rating analysis

Proportions of reviews that mentioned shopping words & no mention rides in the branch

# Rows 1 and 3 of the sorted table: reviews with no shopping words vs reviews
# flagged as mentioning at least one (per the printed freq_all output above).
# NOTE(review): positional selection — re-check if freq_all's ordering changes.
p_mention_orno <- freq_all[c(1,3), 1:4]
# Pie chart of mention vs no-mention proportions for the Hong Kong branch.
p_mention_orno%>%
  ggplot(aes(x="", y=freq_branch_pct, fill=rides)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) +
  labs(fill = "Shopping Words Mentioned") +
  ggtitle("Reviews that mentioned & did not mention shopping words in Hong Kong Branch") +
  theme_void()+
  geom_text(aes(label = paste0(freq_branch_pct, "%")), position = position_stack(vjust=0.5)) +
  scale_fill_manual(values=c("#F1786C", "#E8CCC7"), 
                    labels=c("Mentioned", "Did not mention"))

Proportions of shopping words mentioned in the branch over total number of reviews in the branch & over reviews that mentioned rides in branch

p_shops <- freq_all[4:19, 1:5]

# filter freqs >= 2% of the reviews that mentioned rides in the branch, rating high to low
hot_shops <- p_shops %>%
  filter(freq_Mention_pct >= 2) 

shopping_rank_plot <- hot_shops%>%
  ggplot(aes(x=reorder(rides,freq_Mention_pct), 
             y=freq_Mention_pct, fill=freq_Mention_pct))+
  geom_col(position='dodge')+
  coord_flip()+
  labs(x = "", y = "Frequency in %") +
  scale_y_continuous(name = "Frequency in %", 
                     expand = c(0,0),
                     limits = c(0, 30)) +
  ggtitle("All shopping words mentioned in Hong Kong with % distribution")+
  scale_fill_gradient(low = "#FCDADA", high = "#C50F0F", guide = FALSE) +
  theme(plot.background = element_blank(), 
        panel.background = element_blank(), 
        panel.grid.major.x  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"), 
        plot.title = element_text(size = 20), 
        axis.text.x = element_text(size = 15), 
        axis.text.y = element_text(size = 15))

rating_rank_plot <- hot_shops %>%
  ggplot() +
  geom_col(aes(x=reorder(rides, -avg_rating), y=freq_Mention_pct, fill=freq_Mention_pct), 
           width=0.7, position='dodge')+  
  scale_y_continuous(name = "Frequency in %", 
                     expand = c(0,0),
                     limits = c(-3, 30)) +
  geom_text(aes(x=rides, y=-1.5, label=avg_rating), size = 5) +
  labs(x = "Mostly mentioned shopping words", y = "Frequency in %") +
  ggtitle("Average rating ranked with most popular shopping words & no shopping words") +
  scale_fill_gradient(low = "#FCDADA", high = "#C50F0F", guide = FALSE)+
  labs(tag ="Ratings:")+  
  theme(plot.background = element_blank(), 
        panel.background = element_blank(), 
        panel.grid.major.y  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.tag.position = c(0.01,0.12),
        plot.tag = element_text(hjust = 0, size=13),
        plot.title = element_text(size = 20), 
        axis.text.x = element_text(size = 15), 
        axis.text.y = element_text(size = 13))

#grid.arrange(shopping_rank_plot, rating_rank_plot, ncol = 2)
# NOTE(review): the side-by-side layout above is commented out; the two plots
# are printed separately instead — presumably because the combined layout
# cramps the axis labels. Confirm before re-enabling.
shopping_rank_plot

rating_rank_plot

From the two graphs above, we can see that words including “shop”, “toy”, “book” and “souvenir” are mentioned most frequently in reviews of the Hong Kong branch, indicating the aspects of shopping that customers care about most. Products like books, clothes and toys received higher ratings, while carts, bags and hats received lower ratings, which also reflects relative customer satisfaction.

California

Cleaning texts:

# Clean the California corpus: lower-case, strip punctuation, drop English
# stopwords, collapse whitespace, and stem so terms match the stemmed
# shopping words.
corpus <- corpus_cali
corpus <- tm_map(corpus, FUN = content_transformer(tolower))
corpus <- tm_map(corpus, FUN = removePunctuation)
corpus <- tm_map(corpus, FUN = removeWords, c(stopwords('english')))
corpus <- tm_map(corpus, FUN = stripWhitespace)
corpus <- tm_map(corpus, FUN = stemDocument)
dtm_cali <- DocumentTermMatrix(corpus)
# Every term that is NOT a shopping word gets removed from the corpus.
words_remove <- setdiff(colnames(dtm_cali), shopping_words)
length(words_remove)
## [1] 25632
# removeWords builds one large regex, which fails for tens of thousands of
# words at once, so remove them in fixed-size chunks. This replaces the
# previous hand-numbered (and easy-to-miscount) index ranges and keeps
# working if the vocabulary size changes.
for (chunk in split(words_remove, ceiling(seq_along(words_remove) / 3000))) {
  corpus <- tm_map(corpus, FUN = removeWords, chunk)
}
corpus <- tm_map(corpus, FUN = removeNumbers)

Transform cleaned corpus to document term matrix

# Document-term matrix from the cleaned corpus; only the shopping-related
# terms survive the removeWords passes above.
dtm_cali_then <- DocumentTermMatrix(corpus)
colnames(dtm_cali_then)
##  [1] "book"     "bag"      "shop"     "gift"     "cloth"    "band"    
##  [7] "cart"     "shirt"    "souvenir" "art"      "toy"      "hat"     
## [13] "outlet"   "cashier"  "mall"     "checkout"

Make data frame

# Densify the DTM into a plain data frame (one row per review, one column per
# shopping term), then show total mentions of each term across all reviews.
dtm_cali_then = as.data.frame(as.matrix(dtm_cali_then))
sort(colSums(dtm_cali_then),decreasing = T)
##     shop     book      bag souvenir      toy     gift      hat     band 
##      864      777      392      308      189      173      168      139 
##    cloth     cart    shirt      art   outlet  cashier     mall checkout 
##      123      123       98       87       53       21       16        4

Add information summarizing columns:

# Per-review total of shopping-word occurrences across all term columns.
dtm_cali_then$shopping_word_count <- rowSums(dtm_cali_then)
# Binary indicator: 1 when the review mentions at least one shopping word.
dtm_cali_then$if_shopping_word <- ifelse(dtm_cali_then$shopping_word_count >= 1, 1, 0)

Attach word frequency to original data frame and select columns

# Attach the per-review shopping-term counts to the California reviews and
# drop text/metadata columns that are no longer needed downstream.
cali <- cali %>%
  cbind(dtm_cali_then) %>%
  select(-Review_ID, -Reviewer_Location, 
         -Review_Text, -Year_Month, 
         -Year, -Month, -continent)
# Flag reviews mentioning no shopping word at all.
# NOTE(review): columns 4:19 are assumed to be the 16 shopping-term columns —
# confirm the positions if the upstream column layout changes.
cali['no_shopping_word'] = ifelse(rowSums(cali[4:19]) == 0, 1, 0)
# Drop columns that are zero everywhere (terms never mentioned in this branch).
cali <- cali %>%
  select_if(function(x) !all(x == 0))

Calculate average ratings for each shopping words and add counts

# Word-level summary table for California (columns 4:22 of cali).
shopping_ca = data.frame(rd=names(colSums(cali[c(4:22)])))  # kept for compatibility
word_counts <- colSums(cali[4:22])
# Mean review rating over the reviews in which each word appears at least
# once. Replaces the previous nested for-loop that grew a data frame with
# rbind on every iteration (quadratic and hard to read).
avg_ratings <- vapply(
  names(word_counts),
  function(w) mean(cali$Rating[cali[[w]] != 0]),
  numeric(1)
)
# Shopping words, mention counts, and average ratings.
# (The original tagged the third column `avg_raing`; the inner data-frame
# name won, so the resulting column was already called avg_rating.)
shopping_mention_count <- data.frame(rides = names(word_counts), 
                                     counts = as.numeric(word_counts),
                                     avg_rating = as.numeric(avg_ratings))
shopping_mention_count
##                  rides counts avg_rating
## 1                 book    777   4.286441
## 2                  bag    392   4.003247
## 3                 shop    864   4.410397
## 4                 gift    173   4.416667
## 5                cloth    123   4.385965
## 6                 band    139   4.413223
## 7                 cart    123   4.037383
## 8                shirt     98   4.116279
## 9             souvenir    308   4.117438
## 10                 art     87   4.535211
## 11                 toy    189   4.260606
## 12                 hat    168   4.493151
## 13              outlet     53   4.060000
## 14             cashier     21   3.684211
## 15                mall     16   3.937500
## 16            checkout      4   4.500000
## 17 shopping_word_count   3535   4.267581
## 18    if_shopping_word   2332   4.267581
## 19    no_shopping_word  15870   4.435161

Get frequency of each shopping word in total branch reviews & in reviews that mentioned shopping in the branch

# Frequency of each word as a % of all branch reviews and as a % of total
# shopping-word mentions, rounded to 2 decimals, most-mentioned first.
freq_all <- shopping_mention_count %>%
  mutate(freq_branch_pct = counts / nrow(cali) * 100,
         # Look up the denominator by name instead of the fragile counts[17],
         # which silently breaks if the row order changes.
         freq_Mention_pct = counts / counts[rides == "shopping_word_count"] * 100) %>%
  # mutate_at() is superseded; across() is the current dplyr idiom.
  mutate(across(3:5, ~ round(.x, digits = 2))) %>%
  arrange(desc(freq_Mention_pct))
freq_all
##                  rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1     no_shopping_word  15870       4.44           87.19           448.94
## 2  shopping_word_count   3535       4.27           19.42           100.00
## 3     if_shopping_word   2332       4.27           12.81            65.97
## 4                 shop    864       4.41            4.75            24.44
## 5                 book    777       4.29            4.27            21.98
## 6                  bag    392       4.00            2.15            11.09
## 7             souvenir    308       4.12            1.69             8.71
## 8                  toy    189       4.26            1.04             5.35
## 9                 gift    173       4.42            0.95             4.89
## 10                 hat    168       4.49            0.92             4.75
## 11                band    139       4.41            0.76             3.93
## 12               cloth    123       4.39            0.68             3.48
## 13                cart    123       4.04            0.68             3.48
## 14               shirt     98       4.12            0.54             2.77
## 15                 art     87       4.54            0.48             2.46
## 16              outlet     53       4.06            0.29             1.50
## 17             cashier     21       3.68            0.12             0.59
## 18                mall     16       3.94            0.09             0.45
## 19            checkout      4       4.50            0.02             0.11
Rating Analysis
# Pie chart: share of California reviews that do vs. do not mention shopping
# words (rows 1 and 3 of freq_all are the two summary rows).
p_mention_orno <- freq_all[c(1, 3), 1:4]
p_mention_orno %>%
  ggplot(aes(x = "", y = freq_branch_pct, fill = rides)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(freq_branch_pct, "%")),
            position = position_stack(vjust = 0.5)) +
  labs(fill = "Shopping Words Mentioned") +
  ggtitle("Reviews that mentioned & did not mention shopping words in California Branch") +
  theme_void() +
  scale_fill_manual(values = c("#00A632", "#D6F1DF"),
                    labels = c("Mentioned", "Did not mention"))

Proportions of shopping words mentioned in the branch over total number of reviews in the branch & over reviews that mentioned shopping words in the branch

# Keep only the individual word rows (drop the three summary rows on top)
# and the first five columns.
p_shops <- freq_all[4:19, 1:5]

# filter freqs >= 2% of the reviews that mentioned shopping words in the branch, rating high to low
hot_shops <- p_shops %>%
  filter(freq_Mention_pct >= 2) 

# Horizontal bar chart of shopping-word frequency for California, bars
# ordered and shaded by frequency.
shopping_rank_plot <- hot_shops %>%
  ggplot(aes(x = reorder(rides, freq_Mention_pct),
             y = freq_Mention_pct, fill = freq_Mention_pct)) +
  geom_col(position = 'dodge') +
  coord_flip() +
  labs(x = "", y = "Frequency in %") +
  ggtitle("All shopping words mentioned in California with % distribution") +
  # `guide = FALSE` is deprecated in ggplot2 (>= 3.3.4); "none" hides the legend.
  scale_fill_gradient(low = "#D6F1DF", high = "#00A632", guide = "none") +
  theme(plot.background = element_blank(),
        panel.background = element_blank(),
        panel.grid.major.x  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.title = element_text(size = 20),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 13))

# Same bars ordered by average rating (high to low); rating values are
# printed below each bar at y = -1.2, hence the y-axis extending to -3.
rating_rank_plot <- hot_shops %>%
  ggplot() +
  geom_col(aes(x = reorder(rides, -avg_rating), y = freq_Mention_pct, fill = freq_Mention_pct),
           width = 0.7, position = 'dodge') +
  scale_y_continuous(name = "Frequency in %",
                     expand = c(0, 0),
                     limits = c(-3, 25)) +
  geom_text(aes(x = rides, y = -1.2, label = avg_rating), size = 5) +
  labs(x = "Mostly mentioned shopping words", y = "Frequency in %") +
  ggtitle("Average rating ranked with most popular shopping words & no shopping words") +
  # `guide = FALSE` is deprecated in ggplot2 (>= 3.3.4); "none" hides the legend.
  scale_fill_gradient(low = "#D6F1DF", high = "#00A632", guide = "none") +
  labs(tag = "Ratings:") +
  theme(plot.background = element_blank(),
        panel.background = element_blank(),
        panel.grid.major.y  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.tag.position = c(0.005, 0.135),
        plot.tag = element_text(hjust = 0, size = 14),
        plot.title = element_text(size = 20),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 13))

#grid.arrange(shopping_rank_plot, rating_rank_plot, ncol = 2)
# NOTE(review): the side-by-side layout above is commented out; the two plots
# are printed separately instead.
shopping_rank_plot

rating_rank_plot

From the two graphs above, we can observe that “shop”, “book”, “bag” and “souvenir” are mentioned most frequently. “Art”, “hat”, “gift” and “band” receive higher ratings on average, while the more frequently mentioned products and words received relatively lower ratings.

Paris

Cleaning texts:

# Clean the Paris corpus: lower-case, strip punctuation, drop English
# stopwords, collapse whitespace, and stem so terms match the stemmed
# shopping words.
corpus <- corpus_paris
corpus <- tm_map(corpus, FUN = content_transformer(tolower))
corpus <- tm_map(corpus, FUN = removePunctuation)
corpus <- tm_map(corpus, FUN = removeWords, c(stopwords('english')))
corpus <- tm_map(corpus, FUN = stripWhitespace)
#dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(disneyland$Review_Text))),
#                     lowfreq = 0)
#dict_corpus = Corpus(VectorSource(dict))
corpus <- tm_map(corpus, FUN = stemDocument)
# Renamed from dtm_hk: this is the Paris DTM — the old name was a copy-paste
# slip that silently clobbered the Hong Kong object.
dtm_paris <- DocumentTermMatrix(corpus)
# Every term that is NOT a shopping word gets removed from the corpus.
words_remove <- setdiff(colnames(dtm_paris), shopping_words)
length(words_remove)
## [1] 32589
# removeWords builds one large regex, which fails for tens of thousands of
# words at once, so remove them in fixed-size chunks instead of the previous
# hand-numbered index ranges.
for (chunk in split(words_remove, ceiling(seq_along(words_remove) / 3000))) {
  corpus <- tm_map(corpus, FUN = removeWords, chunk)
}
corpus <- tm_map(corpus, FUN = removeNumbers)

Transform cleaned corpus to document term matrix

# Document-term matrix from the cleaned Paris corpus; only the
# shopping-related terms survive the removeWords passes above.
dtm_paris_then <- DocumentTermMatrix(corpus)
colnames(dtm_paris_then)
##  [1] "outlet"   "book"     "shop"     "hat"      "gift"     "cart"    
##  [7] "souvenir" "shirt"    "toy"      "mall"     "bag"      "band"    
## [13] "cloth"    "cashier"  "art"      "checkout"

Make data frame

# Densify the DTM into a plain data frame (one row per review, one column per
# shopping term), then show total mentions of each term across all reviews.
dtm_paris_then = as.data.frame(as.matrix(dtm_paris_then))
sort(colSums(dtm_paris_then),decreasing = T)
##     book     shop      bag      toy souvenir   outlet     gift      hat 
##     2299     2042      579      340      298      259      243      128 
##    cloth     cart     band    shirt     mall  cashier      art checkout 
##      124      103       67       53       49       44       41        8

Add information summarizing columns:

# Per-review total of shopping-word occurrences across all term columns.
dtm_paris_then$shopping_word_count <- rowSums(dtm_paris_then)
# Binary indicator: 1 when the review mentions at least one shopping word.
dtm_paris_then$if_shopping_word <- ifelse(dtm_paris_then$shopping_word_count >= 1, 1, 0)

Attach word frequency to original data frame and select columns

# Attach the per-review shopping-term counts to the Paris reviews and drop
# text/metadata columns that are no longer needed downstream.
paris <- paris %>%
  cbind(dtm_paris_then) %>%
  select(-Review_ID, -Reviewer_Location, 
         -Review_Text, -Year_Month, 
         -Year, -Month, -continent)
# Flag reviews mentioning no shopping word at all.
# NOTE(review): columns 4:19 are assumed to be the 16 shopping-term columns —
# confirm the positions if the upstream column layout changes.
paris['no_shopping_word'] = ifelse(rowSums(paris[4:19]) == 0, 1, 0)
# Drop columns that are zero everywhere (terms never mentioned in this branch).
paris <- paris %>%
  select_if(function(x) !all(x == 0))

Calculate average ratings for each shopping words and add counts

# Word-level summary table for Paris (columns 4:22 of paris).
shopping_paris = data.frame(rd=names(colSums(paris[c(4:22)])))  # kept for compatibility
word_counts <- colSums(paris[4:22])
# Mean review rating over the reviews in which each word appears at least
# once. Replaces the previous nested for-loop that grew a data frame with
# rbind on every iteration (quadratic and hard to read).
avg_ratings <- vapply(
  names(word_counts),
  function(w) mean(paris$Rating[paris[[w]] != 0]),
  numeric(1)
)
# Shopping words, mention counts, and average ratings.
# (The original tagged the third column `avg_raing`; the inner data-frame
# name won, so the resulting column was already called avg_rating.)
shopping_mention_count <- data.frame(rides = names(word_counts), 
                                     counts = as.numeric(word_counts),
                                     avg_rating = as.numeric(avg_ratings))
shopping_mention_count
##                  rides counts avg_rating
## 1               outlet    259   3.530172
## 2                 book   2299   3.844173
## 3                 shop   2042   3.903348
## 4                  hat    128   4.061947
## 5                 gift    243   3.905830
## 6                 cart    103   3.608696
## 7             souvenir    298   4.059041
## 8                shirt     53   3.653061
## 9                  toy    340   3.858108
## 10                mall     49   3.195652
## 11                 bag    579   3.745455
## 12                band     67   4.134615
## 13               cloth    124   3.965812
## 14             cashier     44   3.050000
## 15                 art     41   3.975000
## 16            checkout      8   3.500000
## 17 shopping_word_count   6677   3.840624
## 18    if_shopping_word   3589   3.840624
## 19    no_shopping_word   9105   4.035695

Get frequency of each shopping word in total branch reviews & in reviews that mentioned shopping in the branch

# Frequency of each word as a % of all branch reviews and as a % of total
# shopping-word mentions, rounded to 2 decimals, most-mentioned first.
freq_all <- shopping_mention_count %>%
  mutate(freq_branch_pct = counts / nrow(paris) * 100,
         # Look up the denominator by name instead of the fragile counts[17],
         # which silently breaks if the row order changes.
         freq_Mention_pct = counts / counts[rides == "shopping_word_count"] * 100) %>%
  # mutate_at() is superseded; across() is the current dplyr idiom.
  mutate(across(3:5, ~ round(.x, digits = 2))) %>%
  arrange(desc(freq_Mention_pct))
freq_all
##                  rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1     no_shopping_word   9105       4.04           71.73           136.36
## 2  shopping_word_count   6677       3.84           52.60           100.00
## 3     if_shopping_word   3589       3.84           28.27            53.75
## 4                 book   2299       3.84           18.11            34.43
## 5                 shop   2042       3.90           16.09            30.58
## 6                  bag    579       3.75            4.56             8.67
## 7                  toy    340       3.86            2.68             5.09
## 8             souvenir    298       4.06            2.35             4.46
## 9               outlet    259       3.53            2.04             3.88
## 10                gift    243       3.91            1.91             3.64
## 11                 hat    128       4.06            1.01             1.92
## 12               cloth    124       3.97            0.98             1.86
## 13                cart    103       3.61            0.81             1.54
## 14                band     67       4.13            0.53             1.00
## 15               shirt     53       3.65            0.42             0.79
## 16                mall     49       3.20            0.39             0.73
## 17             cashier     44       3.05            0.35             0.66
## 18                 art     41       3.98            0.32             0.61
## 19            checkout      8       3.50            0.06             0.12
Rating Analysis
# Pie chart: share of Paris reviews that do vs. do not mention shopping
# words (rows 1 and 3 of freq_all are the two summary rows).
p_mention_orno <- freq_all[c(1, 3), 1:4]
p_mention_orno %>%
  ggplot(aes(x = "", y = freq_branch_pct, fill = rides)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(freq_branch_pct, "%")),
            position = position_stack(vjust = 0.5)) +
  labs(fill = "Shopping Words Mentioned") +
  ggtitle("Reviews that mentioned & did not mention shopping words in Paris Branch") +
  theme_void() +
  scale_fill_manual(values = c("#4169E2", "#D4DFFF"),
                    labels = c("Mentioned", "Did not mention"))

Proportions of shopping words mentioned in the branch over total number of reviews in the branch & over reviews that mentioned shopping words in the branch

# Keep only the individual word rows (drop the three summary rows on top)
# and the first five columns.
p_shops <- freq_all[4:19, 1:5]

# filter freqs >= 2% of the reviews that mentioned shopping words in the branch, rating high to low
hot_shops <- p_shops %>%
  filter(freq_Mention_pct >= 2) 

# Horizontal bar chart of shopping-word frequency for Paris, bars ordered and
# shaded by frequency.
shopping_rank_plot <- hot_shops %>%
  ggplot(aes(x = reorder(rides, freq_Mention_pct),
             y = freq_Mention_pct, fill = freq_Mention_pct)) +
  geom_col(position = 'dodge') +
  coord_flip() +
  labs(x = "", y = "Frequency in %") +
  ggtitle("All shopping words mentioned in Paris with % distribution") +
  # `guide = FALSE` is deprecated in ggplot2 (>= 3.3.4); "none" hides the legend.
  scale_fill_gradient(low = "#D4DFFF", high = "#4169E2", guide = "none") +
  scale_y_continuous(name = "Frequency in %",
                     expand = c(0, 0),
                     limits = c(0, 40)) +
  theme(plot.background = element_blank(),
        panel.background = element_blank(),
        panel.grid.major.x  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.title = element_text(size = 20),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 15))

# Same bars ordered by average rating (high to low); rating values are
# printed below each bar at y = -2.5, hence the y-axis extending to -4.5.
rating_rank_plot <- hot_shops %>%
  ggplot() +
  geom_col(aes(x = reorder(rides, -avg_rating), y = freq_Mention_pct, fill = freq_Mention_pct),
           width = 0.7, position = 'dodge') +
  scale_y_continuous(name = "Frequency in %",
                     expand = c(0, 0),
                     limits = c(-4.5, 40)) +
  geom_text(aes(x = rides, y = -2.5, label = avg_rating), size = 5) +
  labs(x = "Mostly mentioned shopping words", y = "Frequency in %") +
  ggtitle("Average rating ranked with most popular shopping words & no shopping words") +
  # `guide = FALSE` is deprecated in ggplot2 (>= 3.3.4); "none" hides the legend.
  scale_fill_gradient(low = "#D4DFFF", high = "#4169E2", guide = "none") +
  labs(tag = "Ratings:") +
  theme(plot.background = element_blank(),
        panel.background = element_blank(),
        panel.grid.major.y  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.tag.position = c(0.01, 0.125),
        plot.tag = element_text(hjust = 0, size = 15),
        plot.title = element_text(size = 20),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 13))

# grid.arrange(shopping_rank_plot, rating_rank_plot, ncol = 2)
# NOTE(review): the side-by-side layout above is commented out; the two plots
# are printed separately instead.
shopping_rank_plot

rating_rank_plot

From the visualizations, “book” and “shop” are the leading words mentioned in the reviews, followed by “bag” and “toy”. Generally these shopping-related words received lower ratings in the Paris branch compared to the other two, with “souvenir” and “gift” most favored by reviewers.

Sentiment Analysis

Hong Kong

Integrate the three branches and attach to original dataset

# rbind all 3 branches 
# NOTE(review): the cbind below assumes disneyland's rows are ordered exactly
# as hongkong, then cali, then paris after this rbind — confirm the original
# data is grouped by branch in that order, otherwise the flags are misaligned.
tri_branch <- rbind(hongkong, cali, paris)
# cbind indicators to original dataset
# NOTE(review): column 21 is presumably the if_shopping_word indicator and
# column 11 its position after cbind — verify against the column layouts.
disneyland <- cbind(disneyland, tri_branch[, 21])
colnames(disneyland)[11] <- 'if_shopping_words'

Distribution of sentiment scores with shopping words

# AFINN sentiment lexicon: one numeric value per word (negative to positive).
afinn <- read.csv("afinn.csv")

# Per-review mean AFINN score for Hong Kong reviews that mention shopping
# words, shown as a histogram split by sign of the mean score.
dis_w_shop <- disneyland %>%
  filter(Branch == 'Disneyland_HongKong', if_shopping_words == 1) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn) %>%
  summarize(reviewSentiment = mean(value)) %>%
  ungroup() %>%
  ggplot(aes(x = reviewSentiment, fill = reviewSentiment > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(title = "Distribution of Review Sentiment in Hongkong Branch (with shopping words)",
       x = "Review Sentiment Scores",
       y = "Number of Reviews")
dis_w_shop

Distribution of sentiment scores without shopping words

# Per-review mean AFINN score for Hong Kong reviews that do NOT mention
# shopping words, shown as a histogram split by sign of the mean score.
dis_wt_shop <- disneyland %>%
  filter(Branch == 'Disneyland_HongKong', if_shopping_words == 0) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn) %>%
  summarize(reviewSentiment = mean(value)) %>%
  ungroup() %>%
  ggplot(aes(x = reviewSentiment, fill = reviewSentiment > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(title = "Distribution of Review Sentiment in Hongkong Branch (without shopping words)",
       x = "Review Sentiment Scores",
       y = "Number of Reviews")
dis_wt_shop

California

Distribution of sentiment scores with shopping words

# Per-review mean AFINN score for California reviews that mention shopping
# words, shown as a histogram split by sign of the mean score.
dis_w_shop <- disneyland %>%
  filter(Branch == 'Disneyland_California', if_shopping_words == 1) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn) %>%
  summarize(reviewSentiment = mean(value)) %>%
  ungroup() %>%
  ggplot(aes(x = reviewSentiment, fill = reviewSentiment > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(title = "Distribution of Review Sentiment in California Branch (with shopping words)",
       x = "Review Sentiment Scores",
       y = "Number of Reviews")
dis_w_shop

Distribution of sentiment scores without shopping words

# Per-review mean AFINN score for California reviews that do NOT mention
# shopping words, shown as a histogram split by sign of the mean score.
dis_wt_shop <- disneyland %>%
  filter(Branch == 'Disneyland_California', if_shopping_words == 0) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn) %>%
  summarize(reviewSentiment = mean(value)) %>%
  ungroup() %>%
  ggplot(aes(x = reviewSentiment, fill = reviewSentiment > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(title = "Distribution of Review Sentiment in California Branch (without shopping words)",
       x = "Review Sentiment Scores",
       y = "Number of Reviews")
dis_wt_shop

Paris

Distribution of sentiment scores with shopping words

# Per-review mean AFINN score for Paris reviews that mention shopping words,
# shown as a histogram split by sign of the mean score.
dis_w_shop <- disneyland %>%
  filter(Branch == 'Disneyland_Paris', if_shopping_words == 1) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn) %>%
  summarize(reviewSentiment = mean(value)) %>%
  ungroup() %>%
  ggplot(aes(x = reviewSentiment, fill = reviewSentiment > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(title = "Distribution of Review Sentiment in Paris Branch (with shopping words)",
       x = "Review Sentiment Scores",
       y = "Number of Reviews")
dis_w_shop

Distribution of sentiment scores without shopping words

# Per-review mean AFINN score for Paris reviews that do NOT mention shopping
# words, shown as a histogram split by sign of the mean score.
dis_wt_shop <- disneyland %>%
  filter(Branch == 'Disneyland_Paris', if_shopping_words == 0) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn) %>%
  summarize(reviewSentiment = mean(value)) %>%
  ungroup() %>%
  ggplot(aes(x = reviewSentiment, fill = reviewSentiment > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(title = "Distribution of Review Sentiment in Paris Branch (without shopping words)",
       x = "Review Sentiment Scores",
       y = "Number of Reviews")
dis_wt_shop

Overall Sentiment Analysis

Calculate proportions and average scores required

##### sentiment summary helpers #####

# Tokenize the reviews of one branch, restricted by whether they mention
# shopping words (shopping_flag: 1 = mentioned, 0 = not mentioned), and
# attach AFINN sentiment values — one row per scored word.
branch_afinn_words <- function(branch, shopping_flag) {
  disneyland %>%
    filter(Branch == branch, if_shopping_words == shopping_flag) %>%
    select(Review_ID, Review_Text) %>%
    group_by(Review_ID) %>%
    unnest_tokens(output = word, input = Review_Text) %>%
    inner_join(afinn) %>%
    ungroup()
}

# Proportion of scored words with a non-negative AFINN value.
positive_proportion <- function(branch, shopping_flag) {
  branch_afinn_words(branch, shopping_flag) %>%
    mutate(positive = ifelse(value >= 0, 1, 0)) %>%
    summarize(proportion_positive = sum(positive) / n())
}

# Mean AFINN value over all scored words.
mean_sentiment <- function(branch, shopping_flag) {
  branch_afinn_words(branch, shopping_flag) %>%
    summarize(avg = mean(value))
}

# The twelve summaries below replace twelve near-identical copy-pasted
# pipelines; the resulting objects keep their original names.

##### hong kong #####
pos_prop_w_shop_hk  <- positive_proportion('Disneyland_HongKong', 1)
pos_prop_wt_shop_hk <- positive_proportion('Disneyland_HongKong', 0)
mean_w_shop_hk      <- mean_sentiment('Disneyland_HongKong', 1)
mean_wt_shop_hk     <- mean_sentiment('Disneyland_HongKong', 0)

##### california #####
pos_prop_w_shop_ca  <- positive_proportion('Disneyland_California', 1)
pos_prop_wt_shop_ca <- positive_proportion('Disneyland_California', 0)
mean_w_shop_ca      <- mean_sentiment('Disneyland_California', 1)
mean_wt_shop_ca     <- mean_sentiment('Disneyland_California', 0)

##### paris #####
pos_prop_w_shop_paris  <- positive_proportion('Disneyland_Paris', 1)
pos_prop_wt_shop_paris <- positive_proportion('Disneyland_Paris', 0)
mean_w_shop_paris      <- mean_sentiment('Disneyland_Paris', 1)
mean_wt_shop_paris     <- mean_sentiment('Disneyland_Paris', 0)

Proportion of positive words in reviews w/wt shopping words

# make proportion plot
# One row per branch; positive-word proportions for reviews that do / do not
# mention shopping words.
df_prop <- data.frame(
  branch = c('Hong Kong', 'California', 'Paris'),
  prop_pos_words_w_shop = c(as.numeric(pos_prop_w_shop_hk),
                            as.numeric(pos_prop_w_shop_ca),
                            as.numeric(pos_prop_w_shop_paris)),
  prop_pos_words_wt_shop = c(as.numeric(pos_prop_wt_shop_hk),
                             as.numeric(pos_prop_wt_shop_ca),
                             as.numeric(pos_prop_wt_shop_paris))
)
# Long format: one row per (branch, category) for dodged bars.
df_prop_long <- tidyr::pivot_longer(
  df_prop,
  cols = c("prop_pos_words_w_shop", "prop_pos_words_wt_shop"),
  names_to = "category",
  values_to = "proportion"
)
ggplot(df_prop_long, aes(x = branch, y = proportion, fill = category)) +
  geom_col(position = "dodge", width = 0.7, color = "black", size = 0.5) +
  geom_text(aes(label = round(proportion, 2)),
            position = position_dodge(width = 0.9), vjust = -0.25) +
  scale_fill_manual(values = c("#0072B2", "#F0E442"),
                    labels = c("Mentioned", "Did not mention")) +
  labs(title = "Proportion of Positive Words with and without Shopping words",
       x = "Branch", y = "Proportion", fill = 'Shopping words mentioned') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size = 14, face = "bold"),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

Average sentiment scores of reviews w/wt shopping words

# make average score plot
# One row per branch; mean AFINN scores for reviews that do / do not mention
# shopping words.
df_mean <- data.frame(
  branch = c('Hong Kong', 'California', 'Paris'),
  mean_w_shop = c(as.numeric(mean_w_shop_hk),
                  as.numeric(mean_w_shop_ca),
                  as.numeric(mean_w_shop_paris)),
  mean_wt_shop = c(as.numeric(mean_wt_shop_hk),
                   as.numeric(mean_wt_shop_ca),
                   as.numeric(mean_wt_shop_paris))
)
# Long format: one row per (branch, category) for dodged bars.
df_mean_long <- tidyr::pivot_longer(
  df_mean,
  cols = c("mean_w_shop", "mean_wt_shop"),
  names_to = "category",
  values_to = "proportion"
)
ggplot(df_mean_long, aes(x = branch, y = proportion, fill = category)) +
  geom_col(position = "dodge", width = 0.7, color = "black", size = 0.5) +
  geom_text(aes(label = round(proportion, 2)),
            position = position_dodge(width = 0.9), vjust = -0.25) +
  scale_fill_manual(values = c("#0072B2", "#F0E442"),
                    labels = c("Mentioned", "Did not mention")) +
  labs(title = "Average sentiment scores with and without Shopping words",
       x = "Branch", y = "Average sentiment score", fill = 'Shopping words mentioned') +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size = 14, face = "bold"),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

From the proportion and average sentiment score graphs presented above, we observe that across all 3 branches, reviews unrelated to shopping contain a higher proportion of positive words and tend to carry a more positive tone than reviews that discuss shopping. We can thus conclude that improving travelers’ shopping experience may be a common problem that Disneyland as a whole has to address.

Hypothesis 3:

Ho: California Disneyland receives reviews related to ride experiences in a similar proportion to other branches.
Ha: California Disneyland receives reviews related to ride experiences in a different proportion than other branches.

Get ride names
# Ride names from rides.csv, used below for keyword matching in review text.
rides_name <- rides$Ride_name
#head(rides_name)

We matched ride features with reviews mentioned the ride, which is explained in detail in research question no.3.

#rides_name[1:14]
# Tag each review with a 0/1 dummy column per ride it mentions.
#
# Fix: grepl() only uses the FIRST element when `pattern` is a vector (it
# warns and ignores the rest), so the original c("a", "b", ...) calls matched
# only the first phrase of each alternative set. Collapsing the phrases into
# a single "a|b|..." regex makes every alternative count.
mention_flag <- function(text, phrases) {
  # returns numeric 0/1, matching the original case_when() output type
  as.numeric(grepl(paste(phrases, collapse = "|"), text, ignore.case = TRUE))
}

disneyland_ride <- disneyland %>%
  mutate(Astro_Orbiter = mention_flag(Review_Text, c("astro orbiter", "Astro", "Orbiter"))) %>%
  mutate(Avatar_Flight = mention_flag(Review_Text, c("Avatar Flight of Passage", "Avatar Flight", "Avatar Ride"))) %>%
  mutate(Big_Thunder = mention_flag(Review_Text, c("Big Thunder Mountain Railroad", "Big Thunder"))) %>%
  mutate(Buzz_Lightyear = mention_flag(Review_Text, c("Buzz Lightyear's Space Ranger Spin", "Buzz Lightyear's", "Space Ranger Spin", "Space Ranger"))) %>%
  mutate(Dinosaur = mention_flag(Review_Text, c("Dinosaur"))) %>%
  mutate(Dumbo = mention_flag(Review_Text, c("Dumbo the Flying Elephant", "Flying Elephant"))) %>%
  mutate(Expedition_Everest = mention_flag(Review_Text, c("Expedition Everest"))) %>%
  mutate(Frozen_Ever = mention_flag(Review_Text, c("Frozen Ever After", "Ever After"))) %>%
  mutate(Gran_Fiesta = mention_flag(Review_Text, c("Gran Fiesta Tour Starring The Three Caballeros", "Gran Fiesta", "Starring The Three Caballeros", "Three Caballeros"))) %>%
  mutate(Haunted_Mansion = mention_flag(Review_Text, c("Haunted Mansion"))) %>%
  mutate(Small_World = mention_flag(Review_Text, c("It's a Small World", "Small World"))) %>%
  mutate(Journey_Into = mention_flag(Review_Text, c("Journey Into Imagination with Figment", "Imagination with Figment"))) %>%
  mutate(Jungle_Cruise = mention_flag(Review_Text, c("Jungle Cruise"))) %>%
  mutate(Kali_River = mention_flag(Review_Text, c("Kali River Rapids", "Kali River")))

Because some of the rides exist only in the Orlando Disney parks, we exclude the columns associated with rides that are not mentioned in any Review_Text.

# rides mentioned in the Disneyland dataset: 27-3 = 24 rides. There are 40043 reviews. dropped other columns
# First drop ride columns that never appear in any review, then keep the three
# info columns plus the remaining ride dummies (positions per the comment above).
disneyland_ride2 <- disneyland_ride %>%
  select_if(function(x) !all(x == 0)) %>%
  select(c(2, 6, 9, 11:34))

# disneyland_ride2: 3 basic info + 24 rides + 2 mention columns. 29 cols, 40043 reviews
# Hoist the row sum over the ride dummies (cols 4:27) so it is computed once.
ride_hits <- rowSums(disneyland_ride2[c(4:27)])
disneyland_ride2$mention_ride <- ifelse(ride_hits == 0, 0, 1)
disneyland_ride2$no_mention_ride <- ifelse(ride_hits == 0, 1, 0)
#head(disneyland_ride2)

Split the dataset by branch, keeping in each branch only the ride columns that are mentioned at least once in that branch.

# also remove ride columns that are not in branch
# Helper: drop any column that is zero for every row of the branch subset.
drop_all_zero_cols <- function(df) select_if(df, function(x) !all(x == 0))

hongkong <- disneyland_ride2 %>%
  filter(Branch == 'Disneyland_HongKong') %>%
  drop_all_zero_cols()     # HK: 1:3(info) + 4:19(16 rides) + 20:21(mention cols)
cali <- disneyland_ride2 %>%
  filter(Branch == 'Disneyland_California') %>%
  drop_all_zero_cols()     # CA: 1:3(info) + 4:23(20 rides) + 24:25(mention cols)
paris <- disneyland_ride2 %>%
  filter(Branch == 'Disneyland_Paris') %>%
  drop_all_zero_cols()     # PR: 1:3(info) + 4:20(17 rides) + 21:22(mention cols)

We structured the columns to be: 3 columns of basic information, ride dummy variables, and 2 columns indicating whether the line of the review mentioned rides or not.
hongkong: 3 info + 16 rides + 2 mention columns for Hong Kong
cali: 3 info + 20 rides + 2 mention columns for California
paris: 3 info + 17 rides + 2 mention columns for Paris

Hong Kong
#head(hongkong)
# Per-column mention counts for the HK branch (cols 4:21 = 16 ride dummies
# plus the two mention-indicator columns); computed once and reused below.
hk_counts <- colSums(hongkong[c(4:21)])
rides_hk <- data.frame(rd = names(hk_counts))
rides_hk       # HK: 4:19(16 rides) + 20:21(2 mention cols)
##                       rd
## 1            Big_Thunder
## 2               Dinosaur
## 3                  Dumbo
## 4     Expedition_Everest
## 5        Haunted_Mansion
## 6            Small_World
## 7          Jungle_Cruise
## 8                Pirates
## 9           Seven_Dwarfs
## 10        Space_Mountain
## 11       Splash_Mountain
## 12            Star_Tours
## 13           Winnie_Pooh
## 14 Tomorrowland_Speedway
## 15             Toy_Story
## 16             Under_Sea
## 17          mention_ride
## 18       no_mention_ride
hk_counts
##           Big_Thunder              Dinosaur                 Dumbo 
##                     6                     1                     6 
##    Expedition_Everest       Haunted_Mansion           Small_World 
##                    10                    94                   179 
##         Jungle_Cruise               Pirates          Seven_Dwarfs 
##                    85                    29                     1 
##        Space_Mountain       Splash_Mountain            Star_Tours 
##                   785                    17                    21 
##           Winnie_Pooh Tomorrowland_Speedway             Toy_Story 
##                    20                     1                     3 
##             Under_Sea          mention_ride       no_mention_ride 
##                     1                  1008                  8139
# get the avg rating for each ride
# For each ride column (and the two mention-indicator columns), average the
# Rating of reviews where that column is non-zero. Replaces the original
# nested for-loops, which grew `df` with rbind() on every iteration
# (quadratic copies) and looped over a single-element inner vector.
# `df` keeps the same shape: one `avg_rating` row per entry of rides_hk$rd.
df <- data.frame(
  avg_rating = vapply(as.character(rides_hk$rd),
                      function(ride) mean(hongkong$Rating[hongkong[[ride]] != 0]),
                      numeric(1),
                      USE.NAMES = FALSE)
)

# ride names, counts, and average ratings
# Single colSums() pass; the original computed it twice and misspelled the
# argument as `avg_raing` (data.frame() silently ignored the tag because a
# data-frame argument is spliced in under its own inner column name).
hk_counts <- colSums(hongkong[c(4:21)])
ride_mention_count <- data.frame(rides = names(hk_counts), 
                                 counts = as.numeric(hk_counts),
                                 avg_rating = df$avg_rating)

# Frequency of each ride in total branch reviews & in reviews that mentioned rides in the branch
freq_all <- ride_mention_count %>%
  mutate(freq_branch_pct = counts/nrow(hongkong)*100, 
         # look the mention_ride total up by name rather than magic index 17
         freq_Mention_pct = counts/counts[rides == "mention_ride"]*100) %>%
  mutate(across(c(avg_rating, freq_branch_pct, freq_Mention_pct), ~ round(.x, 2))) %>%
  arrange(desc(freq_Mention_pct)) 
freq_all
##                    rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1        no_mention_ride   8139       4.21           88.98           807.44
## 2           mention_ride   1008       4.26           11.02           100.00
## 3         Space_Mountain    785       4.26            8.58            77.88
## 4            Small_World    179       4.35            1.96            17.76
## 5        Haunted_Mansion     94       4.31            1.03             9.33
## 6          Jungle_Cruise     85       4.25            0.93             8.43
## 7                Pirates     29       3.76            0.32             2.88
## 8             Star_Tours     21       4.29            0.23             2.08
## 9            Winnie_Pooh     20       4.15            0.22             1.98
## 10       Splash_Mountain     17       3.71            0.19             1.69
## 11    Expedition_Everest     10       4.20            0.11             0.99
## 12           Big_Thunder      6       4.50            0.07             0.60
## 13                 Dumbo      6       4.50            0.07             0.60
## 14             Toy_Story      3       4.67            0.03             0.30
## 15              Dinosaur      1       3.00            0.01             0.10
## 16          Seven_Dwarfs      1       4.00            0.01             0.10
## 17 Tomorrowland_Speedway      1       3.00            0.01             0.10
## 18             Under_Sea      1       4.00            0.01             0.10
# proportions of reviews that mentioned rides & no mention rides in the branch
p_mention_orno <- freq_all[1:2, 1:4]
p_mention_orno    
##             rides counts avg_rating freq_branch_pct
## 1 no_mention_ride   8139       4.21           88.98
## 2    mention_ride   1008       4.26           11.02
# Pie chart: share of HK reviews that do / do not mention a ride.
p_mention_orno %>%
  ggplot(aes(x = "", y = freq_branch_pct, fill = rides)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(freq_branch_pct, "%")),
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = c("#F1786C", "#E8CCC7"),
                    labels = c("Mentioned", "Did not mention")) +
  labs(fill = "Ride Experience Mentioned") +
  ggtitle("Reviews that mentioned & did not mention ride experiences in Hong Kong") +
  theme_void()

88.98% did not mention ride experiences in HK branch with an average rating of 4.21; 11.02% mentioned ride experiences in HK branch with a slightly higher average rating of 4.26.

# proportions of rides mentioned in the branch
# over total number of reviews in the branch & over reviews that mentioned rides in branch
p_rides <- freq_all[3:18, 1:5]

# keep rides appearing in at least 2% of the ride-mentioning reviews,
# already ordered from most to least mentioned
hot_rides <- filter(p_rides, freq_Mention_pct >= 2)
hot_rides
##             rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1  Space_Mountain    785       4.26            8.58            77.88
## 2     Small_World    179       4.35            1.96            17.76
## 3 Haunted_Mansion     94       4.31            1.03             9.33
## 4   Jungle_Cruise     85       4.25            0.93             8.43
## 5         Pirates     29       3.76            0.32             2.88
## 6      Star_Tours     21       4.29            0.23             2.08
# Horizontal bar chart: mention frequency of the popular HK rides.
ride_rank_plot <- hot_rides%>%
  ggplot(aes(x=reorder(rides,freq_Mention_pct), 
             y=freq_Mention_pct, fill=freq_Mention_pct))+
  geom_col(position='dodge')+
  coord_flip()+
  labs(x = "", y = "Frequency in %") +
  ggtitle("All rides mentioned in Hong Kong with % distribution")+
  # guide = FALSE is deprecated in ggplot2; guide = "none" is the supported
  # equivalent and hides the legend identically
  scale_fill_gradient(low = "#FCDADA", high = "#C50F0F", guide = "none") +
  theme(plot.background = element_blank(), 
        panel.background = element_blank(), 
        panel.grid.major.x  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.title = element_text(size = 15, hjust = -0.267, vjust = 1)) 

# rating  
# Bars ordered by average rating, with each ride's rating printed below its
# bar; the dashed line separates the "no rides mentioned" reference (4.21).
rating_rank_plot <- hot_rides %>%
  ggplot() +
  geom_col(aes(x=reorder(rides, -avg_rating), y=freq_Mention_pct, fill=freq_Mention_pct), 
           width=0.7, position='dodge')+  
  scale_y_continuous(name = "Frequency in %", 
                     expand = c(0,0),
                     limits = c(-10, 80)) +
  geom_text(aes(x=rides, y=-5, label=avg_rating), size = 4.5) +
  labs(x = "Mostly mentioned rides", y = "Frequency in %") +
  ggtitle("Average rating ranked with popular rides & no ride") +
  # guide = FALSE is deprecated in ggplot2; "none" is the supported equivalent
  scale_fill_gradient(low = "#FCDADA", high = "#C50F0F", guide = "none")+
  labs(tag ="Ratings:")+  
  theme(plot.background = element_blank(), 
        panel.background = element_blank(), 
        panel.grid.major.y  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.tag.position = c(0.01,0.115),
        plot.tag = element_text(hjust = 0, size=12.5),
        plot.title = element_text(size = 15)) + 
  geom_vline(xintercept = 5.5, linetype = "dashed") +
  # annotate() draws each fixed label once; the original geom_text(aes(...))
  # redrew them once per row of hot_rides (overplotted 6 times)
  annotate("text", x = 5.5, y = 50, label = "No rides mentioned", angle = 45, color = "#7C7C7C", size = 4) +
  annotate("text", x = 5.5, y = -5, label = "4.21", size = 4)



grid.arrange(ride_rank_plot, rating_rank_plot, ncol = 2)

From the plot above, we can see that the most frequently mentioned ride in the Hong Kong branch is Space_Mountain, followed by Small_World and Haunted_Mansion. Reviewers who mentioned Small_World, Haunted_Mansion, and Star_Tours tend to give higher ratings on average. Space_Mountain, the ride that significantly more people mentioned, obtained an average rating only slightly higher than that of reviews that did not mention any rides. The Pirates ride received a lower rating than reviews that did not mention rides, which might suggest that visitors had a bad impression of this ride, leading them to leave relatively neutral or negative ratings.

California
#head(cali)
# Per-column mention counts for the CA branch (cols 4:25 = 20 ride dummies
# plus the two mention-indicator columns); computed once and reused below.
ca_counts <- colSums(cali[c(4:25)])
rides_ca <- data.frame(rd = names(ca_counts))
rides_ca   # CA: 1:3(info) + 4:23(20 rides) + 24:25(2 mention cols)
##                    rd
## 1       Astro_Orbiter
## 2         Big_Thunder
## 3            Dinosaur
## 4               Dumbo
## 5  Expedition_Everest
## 6     Haunted_Mansion
## 7         Small_World
## 8       Jungle_Cruise
## 9             Mad_Tea
## 10          Peter_Pan
## 11            Pirates
## 12      Soarin_Around
## 13     Space_Mountain
## 14    Splash_Mountain
## 15         Star_Tours
## 16         Test_Track
## 17        Winnie_Pooh
## 18      Twilight_Zone
## 19          Toy_Story
## 20          Under_Sea
## 21       mention_ride
## 22    no_mention_ride
ca_counts
##      Astro_Orbiter        Big_Thunder           Dinosaur              Dumbo 
##                  8                 85                 27                  5 
## Expedition_Everest    Haunted_Mansion        Small_World      Jungle_Cruise 
##                  4                828                401                244 
##            Mad_Tea          Peter_Pan            Pirates      Soarin_Around 
##                 23                 48                663                  4 
##     Space_Mountain    Splash_Mountain         Star_Tours         Test_Track 
##               1439                840                548                 10 
##        Winnie_Pooh      Twilight_Zone          Toy_Story          Under_Sea 
##                  4                  7                 20                 15 
##       mention_ride    no_mention_ride 
##               3079              15123
# get the avg rating for each ride
# Same fix as the HK section: average the Rating of reviews whose ride column
# is non-zero, via one vapply instead of nested loops growing `df` by rbind().
df <- data.frame(
  avg_rating = vapply(as.character(rides_ca$rd),
                      function(ride) mean(cali$Rating[cali[[ride]] != 0]),
                      numeric(1),
                      USE.NAMES = FALSE)
)

# ride names, counts, and average ratings
# Single colSums() pass; fixes the `avg_raing` argument typo (which
# data.frame() silently ignored when splicing in the data-frame argument).
ca_counts <- colSums(cali[c(4:25)])
ride_mention_count <- data.frame(rides = names(ca_counts), 
                                 counts = as.numeric(ca_counts),
                                 avg_rating = df$avg_rating)

# Frequency of each ride in total branch reviews & in reviews that mentioned rides in the branch
freq_all <- ride_mention_count %>%
  mutate(freq_branch_pct = counts/nrow(cali)*100, 
         # look the mention_ride total up by name rather than magic index 21
         freq_Mention_pct = counts/counts[rides == "mention_ride"]*100) %>%
  mutate(across(c(avg_rating, freq_branch_pct, freq_Mention_pct), ~ round(.x, 2))) %>%
  arrange(desc(freq_Mention_pct)) 
freq_all
##                 rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1     no_mention_ride  15123       4.43           83.08           491.17
## 2        mention_ride   3079       4.36           16.92           100.00
## 3      Space_Mountain   1439       4.37            7.91            46.74
## 4     Splash_Mountain    840       4.38            4.61            27.28
## 5     Haunted_Mansion    828       4.43            4.55            26.89
## 6             Pirates    663       4.42            3.64            21.53
## 7          Star_Tours    548       4.52            3.01            17.80
## 8         Small_World    401       4.36            2.20            13.02
## 9       Jungle_Cruise    244       4.37            1.34             7.92
## 10        Big_Thunder     85       4.59            0.47             2.76
## 11          Peter_Pan     48       4.31            0.26             1.56
## 12           Dinosaur     27       4.37            0.15             0.88
## 13            Mad_Tea     23       4.78            0.13             0.75
## 14          Toy_Story     20       4.25            0.11             0.65
## 15          Under_Sea     15       4.40            0.08             0.49
## 16         Test_Track     10       4.30            0.05             0.32
## 17      Astro_Orbiter      8       4.62            0.04             0.26
## 18      Twilight_Zone      7       4.43            0.04             0.23
## 19              Dumbo      5       4.80            0.03             0.16
## 20 Expedition_Everest      4       4.50            0.02             0.13
## 21      Soarin_Around      4       4.50            0.02             0.13
## 22        Winnie_Pooh      4       5.00            0.02             0.13
# proportions of reviews that mentioned rides & no mention rides in the branch
p_mention_orno <- freq_all[1:2, 1:4]
p_mention_orno    
##             rides counts avg_rating freq_branch_pct
## 1 no_mention_ride  15123       4.43           83.08
## 2    mention_ride   3079       4.36           16.92
# Pie chart: share of CA reviews that do / do not mention a ride.
p_mention_orno %>%
  ggplot(aes(x = "", y = freq_branch_pct, fill = rides)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(freq_branch_pct, "%")),
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = c("#19B34C", "#D6F1DF"),
                    labels = c("Mentioned", "Did not mention")) +
  labs(fill = "Ride Experience Mentioned") +
  ggtitle("Reviews that mentioned & did not mention ride experiences in California") +
  theme_void()

83.08% did not mention ride experiences in the CA branch, with an average rating of 4.43; 16.92% mentioned ride experiences, with a slightly lower average rating of 4.36. This finding differs from what we had in the HK branch, where the average rating is slightly higher for reviews that mentioned ride experiences.

# proportions of rides mentioned in the branch
# over total number of reviews in the branch & over reviews that mentioned rides in branch
p_rides <- freq_all[3:22, 1:5]

# keep rides appearing in at least 2% of the ride-mentioning reviews,
# already ordered from most to least mentioned
hot_rides <- filter(p_rides, freq_Mention_pct >= 2)
hot_rides
##             rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1  Space_Mountain   1439       4.37            7.91            46.74
## 2 Splash_Mountain    840       4.38            4.61            27.28
## 3 Haunted_Mansion    828       4.43            4.55            26.89
## 4         Pirates    663       4.42            3.64            21.53
## 5      Star_Tours    548       4.52            3.01            17.80
## 6     Small_World    401       4.36            2.20            13.02
## 7   Jungle_Cruise    244       4.37            1.34             7.92
## 8     Big_Thunder     85       4.59            0.47             2.76
# Horizontal bar chart: mention frequency of the popular CA rides.
ride_rank_plot <- hot_rides%>%
  ggplot(aes(x=reorder(rides,freq_Mention_pct), 
             y=freq_Mention_pct, fill=freq_Mention_pct))+
  geom_col(position='dodge')+
  coord_flip()+
  labs(x = "", y = "Frequency in %") +
  ggtitle("All rides mentioned in California with % distribution")+
  # guide = FALSE is deprecated in ggplot2; "none" is the supported equivalent
  scale_fill_gradient(low = "#D6F1DF", high = "#19B34C", guide = "none") +
  theme(plot.background = element_blank(), 
        panel.background = element_blank(), 
        panel.grid.major.x  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.title = element_text(size = 15, hjust = -0.267, vjust = 1)) 

# Bars ordered by average rating, with each ride's rating printed below its bar.
rating_rank_plot <- hot_rides %>%
  ggplot() +
  geom_col(aes(x=reorder(rides, -avg_rating), y=freq_Mention_pct, fill=freq_Mention_pct), 
           width=0.7, position='dodge')+  
  scale_y_continuous(name = "Frequency in %", 
                     expand = c(0,0),
                     limits = c(-10, 80)) +
  geom_text(aes(x=rides, y=-5, label=avg_rating), size = 4.5) +
  labs(x = "Mostly mentioned rides", y = "Frequency in %") +
  ggtitle("Average rating ranked with popular rides & no ride") +
  # guide = FALSE is deprecated in ggplot2; "none" is the supported equivalent
  scale_fill_gradient(low = "#D6F1DF", high = "#19B34C", guide = "none")+
  labs(tag ="Ratings:")+  
  theme(plot.background = element_blank(), 
        panel.background = element_blank(), 
        panel.grid.major.y  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.tag.position = c(0.01,0.108),
        plot.tag = element_text(hjust = 0, size=12.5),
        plot.title = element_text(size = 15)) + 
  geom_vline(xintercept = 3, linetype = "dashed") +
  # annotate() draws each fixed label once; the original geom_text(aes(...))
  # redrew them once per row of hot_rides (overplotted 8 times)
  annotate("text", x = 3, y = 50, label = "No rides mentioned", angle = 45, color = "#7C7C7C", size = 4) +
  # NOTE(review): unlike the HK (4.21) and Paris (3.98) versions, the
  # no-mention rating label here is blank — confirm whether "4.43" was meant
  annotate("text", x = 5.5, y = -5, label = " ", size = 5)



grid.arrange(ride_rank_plot, rating_rank_plot, ncol = 2)

From the plot above, we can see that the most frequently mentioned ride in the California branch is also Space_Mountain, the same as in the Hong Kong branch, followed by Splash_Mountain and Haunted_Mansion. However, when it comes to average ratings, we found that in California, the rides people mention more often did not receive higher ratings; their ratings even tend to be lower than those of reviews that did not mention any rides. The most frequently mentioned ride, Space_Mountain, received an average rating of 4.37. Although this is slightly higher than the same ride's rating in the Hong Kong branch, it is still lower than the average rating of California reviews that did not mention rides. This might suggest that consumers are not entirely happy with their general ride experiences.

Paris
#head(paris)
# Per-column mention counts for the Paris branch (cols 4:22 = 17 ride dummies
# plus the two mention-indicator columns); computed once and reused below.
pr_counts <- colSums(paris[c(4:22)])
rides_pr <- data.frame(rd = names(pr_counts))
rides_pr    # PR: 4:20(17 rides) + 21:22(2 mention cols)
##                 rd
## 1    Astro_Orbiter
## 2      Big_Thunder
## 3         Dinosaur
## 4            Dumbo
## 5  Haunted_Mansion
## 6      Small_World
## 7    Jungle_Cruise
## 8    Mission_Space
## 9        Peter_Pan
## 10         Pirates
## 11     Rock_Roller
## 12  Space_Mountain
## 13 Splash_Mountain
## 14      Star_Tours
## 15   Twilight_Zone
## 16       Toy_Story
## 17       Under_Sea
## 18    mention_ride
## 19 no_mention_ride
pr_counts
##   Astro_Orbiter     Big_Thunder        Dinosaur           Dumbo Haunted_Mansion 
##               1              27               1              17             141 
##     Small_World   Jungle_Cruise   Mission_Space       Peter_Pan         Pirates 
##             286               8               5              78             601 
##     Rock_Roller  Space_Mountain Splash_Mountain      Star_Tours   Twilight_Zone 
##               5            1241              20             334              16 
##       Toy_Story       Under_Sea    mention_ride no_mention_ride 
##               1               2            1951           10743
# get the avg rating for each ride
# Same fix as the HK section: average the Rating of reviews whose ride column
# is non-zero, via one vapply instead of nested loops growing `df` by rbind().
df <- data.frame(
  avg_rating = vapply(as.character(rides_pr$rd),
                      function(ride) mean(paris$Rating[paris[[ride]] != 0]),
                      numeric(1),
                      USE.NAMES = FALSE)
)

# ride names, counts, and average ratings
# Single colSums() pass; fixes the `avg_raing` argument typo (which
# data.frame() silently ignored when splicing in the data-frame argument).
pr_counts <- colSums(paris[c(4:22)])
ride_mention_count <- data.frame(rides = names(pr_counts), 
                                 counts = as.numeric(pr_counts),
                                 avg_rating = df$avg_rating)

# Frequency of each ride in total branch reviews & in reviews that mentioned rides in the branch
freq_all <- ride_mention_count %>%
  mutate(freq_branch_pct = counts/nrow(paris)*100, 
         # look the mention_ride total up by name rather than magic index 18
         freq_Mention_pct = counts/counts[rides == "mention_ride"]*100) %>%
  mutate(across(c(avg_rating, freq_branch_pct, freq_Mention_pct), ~ round(.x, 2))) %>%
  arrange(desc(freq_Mention_pct)) 
freq_all
##              rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1  no_mention_ride  10743       3.98           84.63           550.64
## 2     mention_ride   1951       3.97           15.37           100.00
## 3   Space_Mountain   1241       4.01            9.78            63.61
## 4          Pirates    601       3.96            4.73            30.80
## 5       Star_Tours    334       3.96            2.63            17.12
## 6      Small_World    286       4.17            2.25            14.66
## 7  Haunted_Mansion    141       3.73            1.11             7.23
## 8        Peter_Pan     78       4.18            0.61             4.00
## 9      Big_Thunder     27       3.89            0.21             1.38
## 10 Splash_Mountain     20       3.95            0.16             1.03
## 11           Dumbo     17       4.18            0.13             0.87
## 12   Twilight_Zone     16       3.88            0.13             0.82
## 13   Jungle_Cruise      8       3.88            0.06             0.41
## 14   Mission_Space      5       4.20            0.04             0.26
## 15     Rock_Roller      5       3.40            0.04             0.26
## 16       Under_Sea      2       5.00            0.02             0.10
## 17   Astro_Orbiter      1       4.00            0.01             0.05
## 18        Dinosaur      1       3.00            0.01             0.05
## 19       Toy_Story      1       5.00            0.01             0.05
# proportions of reviews that mentioned rides & no mention rides in the branch
p_mention_orno <- freq_all[1:2, 1:4]
p_mention_orno
##             rides counts avg_rating freq_branch_pct
## 1 no_mention_ride  10743       3.98           84.63
## 2    mention_ride   1951       3.97           15.37
#                 avg_rating  frequency %
# HK: no_mention:   4.21        88.98
#     mentioned:    4.26        11.02
# CA: no_mention:   4.43        83.08   
#     mentioned:    4.36        16.92


# Pie chart: share of Paris reviews that do / do not mention a ride.
p_mention_orno %>%
  ggplot(aes(x = "", y = freq_branch_pct, fill = rides)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(freq_branch_pct, "%")),
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = c("#4169E2", "#BACBFF"),
                    labels = c("Mentioned", "Did not mention")) +
  labs(fill = "Ride Experience Mentioned") +
  ggtitle("Reviews that mentioned & did not mention ride experiences in Paris") +
  theme_void()

84.63% did not mention ride experiences in the PR branch, with an average rating of 3.98; 15.37% mentioned ride experiences, with a very similar average rating of 3.97. Compared with the two previous branches, both averages are much lower than in HK and CA. This might suggest that the Paris branch receives generally lower ratings than the other branches. The proportions of reviews that did and did not mention rides are similar to those of the other two branches.

# proportions of rides mentioned in the branch
# over total number of reviews in the branch & over reviews that mentioned rides in branch
p_rides <- freq_all[3:19, 1:5]

# keep rides appearing in at least 2% of the ride-mentioning reviews,
# already ordered from most to least mentioned
hot_rides <- filter(p_rides, freq_Mention_pct >= 2)
hot_rides
##             rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1  Space_Mountain   1241       4.01            9.78            63.61
## 2         Pirates    601       3.96            4.73            30.80
## 3      Star_Tours    334       3.96            2.63            17.12
## 4     Small_World    286       4.17            2.25            14.66
## 5 Haunted_Mansion    141       3.73            1.11             7.23
## 6       Peter_Pan     78       4.18            0.61             4.00
# Horizontal bar chart: mention frequency of the popular Paris rides.
ride_rank_plot <- hot_rides%>%
  ggplot(aes(x=reorder(rides,freq_Mention_pct), 
             y=freq_Mention_pct, fill=freq_Mention_pct))+
  geom_col(position='dodge')+
  coord_flip()+
  labs(x = "", y = "Frequency in %") +
  ggtitle("All rides mentioned in Paris with % distribution")+
  # guide = FALSE is deprecated in ggplot2; "none" is the supported equivalent
  scale_fill_gradient(low = "#D4DFFF", high = "#083DDC", guide = "none") +
  theme(plot.background = element_blank(), 
        panel.background = element_blank(), 
        panel.grid.major.x  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.title = element_text(size = 15, hjust = -0.267, vjust = 1)) 


# Bars ordered by average rating, with each ride's rating printed below its
# bar; the dashed line marks the "no rides mentioned" reference (3.98).
rating_rank_plot <- hot_rides %>%
  ggplot() +
  geom_col(aes(x=reorder(rides, -avg_rating), y=freq_Mention_pct, fill=freq_Mention_pct), 
           width=0.7, position='dodge')+  
  scale_y_continuous(name = "Frequency in %", 
                     expand = c(0,0),
                     limits = c(-10, 80)) +
  geom_text(aes(x=rides, y=-5, label=avg_rating), size = 4.5) +
  labs(x = "Mostly mentioned rides", y = "Frequency in %") +
  ggtitle("Average rating ranked with popular rides & no ride") +
  # guide = FALSE is deprecated in ggplot2; "none" is the supported equivalent
  scale_fill_gradient(low = "#D4DFFF", high = "#083DDC", guide = "none")+
  labs(tag ="Ratings:")+  
  theme(plot.background = element_blank(), 
        panel.background = element_blank(), 
        panel.grid.major.y  = element_line(color = "light grey"),
        axis.line = element_line(color = "black"),
        plot.tag.position = c(0.01,0.115),
        plot.tag = element_text(hjust = 0, size=12.5),
        plot.title = element_text(size = 15)) + 
  geom_vline(xintercept = 3.5, linetype = "dashed") +
  # annotate() draws each fixed label once; the original geom_text(aes(...))
  # redrew them once per row of hot_rides (overplotted 6 times)
  annotate("text", x = 3.5, y = 70, label = "No rides mentioned", angle = 30, color = "#7C7C7C", size = 4) +
  annotate("text", x = 3.5, y = -5, label = "3.98", size = 4)

grid.arrange(ride_rank_plot, rating_rank_plot, ncol = 2)

The most frequently mentioned ride in the Paris branch is also Space_Mountain, followed by Pirates and Star_Tours. As mentioned before, the general rating for the Paris branch is much lower than for the other two branches, and this holds whether or not reviews mention ride experiences. Peter_Pan and Small_World received similarly higher average ratings, followed by Space_Mountain. Reviewers who did not mention any ride rated slightly higher on average than those mentioning rides such as Pirates, Star_Tours, and Haunted_Mansion. This might suggest that, apart from Peter_Pan and Small_World, the rides most frequently mentioned in the Paris branch tend to receive more neutral ratings. People tend not to give higher ratings when they mention popular, frequently mentioned rides compared with ratings from reviews that do not mention rides; in that case, the most-mentioned rides do not appear to be especially favored by visitors.

Sentiment Analysis

Hong Kong

Integrate the three branches and attach to original dataset

# rbind all 3 branches 
# Select the two indicator columns by name instead of by position: the
# numeric offsets (20:21 / 24:25 / 21:22) silently break if the per-branch
# column counts change upstream.
mention_cols <- c("mention_ride", "no_mention_ride")
tri_branch <- rbind(hongkong[, mention_cols], cali[, mention_cols], paris[, mention_cols])
# cbind indicators to original dataset
# NOTE(review): this assumes disneyland's rows are ordered as a HongKong
# block, then California, then Paris — verify; otherwise join on Review_ID.
disneyland <- cbind(disneyland, tri_branch)

Distribution of sentiment scores with ride experience

# HK reviews that mention at least one ride: tokenize the review text and keep
# only words carrying an AFINN sentiment value.
# NOTE(review): the "_shop" suffix is left over from the shopping hypothesis;
# this frame actually holds the ride-related subset.
dis_w_shop <- disneyland %>%
  filter(Branch == 'Disneyland_HongKong', mention_ride == 1) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn)

# Mean AFINN value per review.
agg_data <- aggregate(dis_w_shop[, 'value'], by = list(dis_w_shop$Review_ID), FUN = mean)
colnames(agg_data) <- c("Review_ID", "mean_value")

# Histogram of per-review mean sentiment, colored by sign.
agg_data %>%
  ggplot(aes(x = mean_value, fill = mean_value > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
  ggtitle("Distribution of Review Sentiment in Hongkong Branch (contain ride experience)")

Distribution of sentiment scores without ride experience

# HK reviews that mention no ride: tokenize and keep AFINN-scored words.
# NOTE(review): the "_shop" suffix is left over from the shopping hypothesis.
dis_wt_shop <- disneyland %>%
  filter(Branch == 'Disneyland_HongKong', mention_ride == 0) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn)

# Mean AFINN value per review.
agg_data <- aggregate(dis_wt_shop[, 'value'], by = list(dis_wt_shop$Review_ID), FUN = mean)
colnames(agg_data) <- c("Review_ID", "mean_value")

# Histogram of per-review mean sentiment, colored by sign.
agg_data %>%
  ggplot(aes(x = mean_value, fill = mean_value > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
  ggtitle("Distribution of Review Sentiment in Hongkong Branch (not contain ride experience)")

California

Distribution of sentiment scores with ride experience

# CA reviews that mention at least one ride: tokenize and keep AFINN-scored words.
# NOTE(review): the "_shop" suffix is left over from the shopping hypothesis.
dis_w_shop <- disneyland %>%
  filter(Branch == 'Disneyland_California', mention_ride == 1) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn)

# Mean AFINN value per review.
agg_data <- aggregate(dis_w_shop[, 'value'], by = list(dis_w_shop$Review_ID), FUN = mean)
colnames(agg_data) <- c("Review_ID", "mean_value")

# Histogram of per-review mean sentiment, colored by sign.
agg_data %>%
  ggplot(aes(x = mean_value, fill = mean_value > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
  ggtitle("Distribution of Review Sentiment in California Branch (contain ride experience)")

Distribution of sentiment scores without ride experience

# Tokenize California reviews that do NOT mention a ride and attach AFINN scores.
dis_wt_shop <- disneyland %>%
  filter(Branch == 'Disneyland_California', mention_ride == 0) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn)

# One mean sentiment score per review (columns: Review_ID, mean_value).
agg_data <- dis_wt_shop %>%
  group_by(Review_ID) %>%
  summarize(mean_value = mean(value)) %>%
  as.data.frame()

# Histogram of the per-review means, colored by sign of the score.
ggplot(agg_data, aes(x = mean_value, fill = mean_value > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
  ggtitle("Distribution of Review Sentiment in California Branch (not contain ride experience)")

Paris

Distribution of sentiment scores with ride experience

# Tokenize Paris reviews that mention a ride and attach AFINN word scores.
dis_w_shop <- disneyland %>%
  filter(Branch == 'Disneyland_Paris', mention_ride == 1) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn)

# One mean sentiment score per review (columns: Review_ID, mean_value).
agg_data <- dis_w_shop %>%
  group_by(Review_ID) %>%
  summarize(mean_value = mean(value)) %>%
  as.data.frame()

# Histogram of the per-review means, colored by sign of the score.
ggplot(agg_data, aes(x = mean_value, fill = mean_value > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
  ggtitle("Distribution of Review Sentiment in Paris Branch (contain ride experience)")

Distribution of sentiment scores without ride experience

# Tokenize Paris reviews that do NOT mention a ride and attach AFINN scores.
dis_wt_shop <- disneyland %>%
  filter(Branch == 'Disneyland_Paris', mention_ride == 0) %>%
  select(Review_ID, Review_Text) %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = Review_Text) %>%
  inner_join(afinn)

# One mean sentiment score per review (columns: Review_ID, mean_value).
agg_data <- dis_wt_shop %>%
  group_by(Review_ID) %>%
  summarize(mean_value = mean(value)) %>%
  as.data.frame()

# Histogram of the per-review means, colored by sign of the score.
ggplot(agg_data, aes(x = mean_value, fill = mean_value > 0)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = c('tomato', 'seagreen')) +
  theme_bw() +
  labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
  ggtitle("Distribution of Review Sentiment in Paris Branch (not contain ride experience)")

Overall Sentiment Analysis

Calculate proportions and average scores required

##### helper functions #####
# The same two computations were previously copy-pasted twelve times (once per
# branch x ride-mention combination); factor them into helpers.  The original
# comments said "shopping words", but the filter is actually on the ride-mention
# indicator (`mention_ride`) — comments corrected accordingly.

# Proportion of AFINN-matched tokens with a non-negative score, for reviews of
# `branch` that do (ride = 1) or do not (ride = 0) mention a ride experience.
# Returns a 1x1 tibble with column `proportion_positive`, as before.
prop_positive_afinn <- function(data, branch, ride) {
  data %>%
    filter(Branch == branch, mention_ride == ride) %>%
    select(Review_ID, Review_Text) %>%
    unnest_tokens(output = word, input = Review_Text) %>%
    inner_join(afinn) %>%
    mutate(positive = ifelse(value >= 0, 1, 0)) %>%
    summarize(proportion_positive = sum(positive) / n())
}

# Mean AFINN score over all matched tokens for the same review subset.
# Returns a 1x1 tibble with column `avg`, as before.  (The original
# group_by/ungroup pair around the token stream was a no-op and is dropped.)
mean_afinn <- function(data, branch, ride) {
  data %>%
    filter(Branch == branch, mention_ride == ride) %>%
    select(Review_ID, Review_Text) %>%
    unnest_tokens(output = word, input = Review_Text) %>%
    inner_join(afinn) %>%
    summarize(avg = mean(value))
}

##### hong kong #####
pos_prop_w_shop_hk  <- prop_positive_afinn(disneyland, 'Disneyland_HongKong', 1)
pos_prop_wt_shop_hk <- prop_positive_afinn(disneyland, 'Disneyland_HongKong', 0)
mean_w_shop_hk      <- mean_afinn(disneyland, 'Disneyland_HongKong', 1)
mean_wt_shop_hk     <- mean_afinn(disneyland, 'Disneyland_HongKong', 0)

##### california #####
pos_prop_w_shop_ca  <- prop_positive_afinn(disneyland, 'Disneyland_California', 1)
pos_prop_wt_shop_ca <- prop_positive_afinn(disneyland, 'Disneyland_California', 0)
mean_w_shop_ca      <- mean_afinn(disneyland, 'Disneyland_California', 1)
mean_wt_shop_ca     <- mean_afinn(disneyland, 'Disneyland_California', 0)

##### paris #####
pos_prop_w_shop_paris  <- prop_positive_afinn(disneyland, 'Disneyland_Paris', 1)
pos_prop_wt_shop_paris <- prop_positive_afinn(disneyland, 'Disneyland_Paris', 0)
mean_w_shop_paris      <- mean_afinn(disneyland, 'Disneyland_Paris', 1)
mean_wt_shop_paris     <- mean_afinn(disneyland, 'Disneyland_Paris', 0)

Proportion of positive words in reviews with/without ride-experience mentions

# Grouped bar chart: proportion of positive words by branch, split by whether
# the review mentions a ride experience.
df_prop <- data.frame(
  branch = c('Hong Kong', 'California', 'Paris'),
  prop_pos_words_w_shop = c(as.numeric(pos_prop_w_shop_hk),
                            as.numeric(pos_prop_w_shop_ca),
                            as.numeric(pos_prop_w_shop_paris)),
  prop_pos_words_wt_shop = c(as.numeric(pos_prop_wt_shop_hk),
                             as.numeric(pos_prop_wt_shop_ca),
                             as.numeric(pos_prop_wt_shop_paris))
)

# Long format: one row per branch x mention category.
df_prop_long <- tidyr::pivot_longer(
  df_prop,
  cols = c("prop_pos_words_w_shop", "prop_pos_words_wt_shop"),
  names_to = "category",
  values_to = "proportion"
)

ggplot(df_prop_long, aes(x = branch, y = proportion, fill = category)) +
  geom_col(position = "dodge", width = 0.7, color = "black", size = 0.5) +
  scale_fill_manual(values = c("#0072B2", "#F0E442"),
                    labels = c("Mentioned", "Did not mention")) +
  labs(title = "Proportion of Positive Words with and without ride experience",
       x = "Branch", y = "Proportion", fill = 'Ride experience mentioned') +
  geom_text(aes(label = round(proportion, 2)),
            position = position_dodge(width = 0.9), vjust = -0.25) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size = 14, face = "bold"),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

Average sentiment scores of reviews with/without ride-experience mentions

# Grouped bar chart: average AFINN sentiment score by branch, split by whether
# the review mentions a ride experience.
df_mean <- data.frame(
  branch = c('Hong Kong', 'California', 'Paris'),
  mean_w_shop = c(as.numeric(mean_w_shop_hk),
                  as.numeric(mean_w_shop_ca),
                  as.numeric(mean_w_shop_paris)),
  mean_wt_shop = c(as.numeric(mean_wt_shop_hk),
                   as.numeric(mean_wt_shop_ca),
                   as.numeric(mean_wt_shop_paris))
)

# Long format: one row per branch x mention category.
df_mean_long <- tidyr::pivot_longer(
  df_mean,
  cols = c("mean_w_shop", "mean_wt_shop"),
  names_to = "category",
  values_to = "proportion"
)

ggplot(df_mean_long, aes(x = branch, y = proportion, fill = category)) +
  geom_col(position = "dodge", width = 0.7, color = "black", size = 0.5) +
  scale_fill_manual(values = c("#0072B2", "#F0E442"),
                    labels = c("Mentioned", "Did not mention")) +
  labs(title = "Average sentiment scores with and without ride experience",
       x = "Branch", y = "Average sentiment score",
       fill = 'Ride experience mentioned') +
  geom_text(aes(label = round(proportion, 2)),
            position = position_dodge(width = 0.9), vjust = -0.25) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_text(size = 14, face = "bold"),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12))

Regarding sentiment analysis, the difference in proportion of positive words between reviews with ride experience or not is ambiguous, but the average sentiment score for ride-related reviews are much lower, indicating that reviewers may have more extreme emotions toward ride experience. Thus, every branch should examine each of their ride services and make improvements accordingly.

Research Question No.3

We classified the topic words in the reviews into four categories: time, theme rides, dining, and customer services. We aim to find which specific topic words/word categories reviewers share in their reviews when they tend to provide a high & low rating.

Hypothesis 1:
Ho: Reviews discussing time words would not affect overall review ratings.
Ha: Reviews discussing time words would affect overall review ratings.
Hypothesis 2:
Ho: Reviews discussing food words would not affect overall review ratings.
Ha: Reviews discussing food words would affect overall review ratings.
Hypothesis 3:
Ho: Ride features would not affect overall review ratings.
Ha: Ride features would affect overall review ratings.
Hypothesis 4:
Ho: Reviews discussing staff would not affect overall review ratings.
Ha: Reviews discussing staff would affect overall review ratings.

Clean and Tokenize

We created a corpus of the reviews from the variable Review_Text using library(tm), and then preprocessed the corpus of reviews using functions from library(tm) as well. Specifically, we transformed text to lower case, removed punctuation, removed English stopwords using the dictionary tm::stopwords(“english”), removed numbers, removed whitespace, and stemmed documents. We also retained all terms that appear in 5% or more reviews and converted the document-term matrix to a data frame so that the column names would be the tokens.

#head(disneyland)
#install.packages("tm")
library(tm)
#install.packages('topicmodels')
library(topicmodels)

#Clean the Review_Text column:
# Build a tm corpus over every review in the combined dataset.
corpus1 <- Corpus(VectorSource(disneyland$Review_Text))

#(a) transform text to lower case,
corpus1 = tm_map(corpus1,FUN = content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus1, FUN = content_transformer(tolower)):
## transformation drops documents
#(b) remove punctuation,
# NOTE(review): stripping punctuation before stopword removal turns e.g.
# "don't" into "dont", which stopwords('english') then fails to match;
# consider removing stopwords before punctuation.
corpus1 = tm_map(corpus1,FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus1, FUN = removePunctuation): transformation
## drops documents
#(c) remove English stopwords using the following dictionary tm::stopwords("english")
corpus1 = tm_map(corpus1,FUN = removeWords,c(stopwords('english')))
## Warning in tm_map.SimpleCorpus(corpus1, FUN = removeWords,
## c(stopwords("english"))): transformation drops documents
#(d) remove numbers
corpus1 = tm_map(corpus1,FUN = removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus1, FUN = removeNumbers): transformation
## drops documents
#(e) remove whitespace
corpus1 = tm_map(corpus1,FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus1, FUN = stripWhitespace): transformation
## drops documents
#(f) stem words
corpus1 = tm_map(corpus1,FUN = stemDocument)
## Warning in tm_map.SimpleCorpus(corpus1, FUN = stemDocument): transformation
## drops documents
# Dictionary of UNstemmed terms from the raw review text (lowfreq = 0 keeps
# every term), used by stemCompletion() below to map stems back to real words.
dict1 = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(disneyland$Review_Text))),
                     lowfreq = 0)
dict_corpus1 = Corpus(VectorSource(dict1))


# Term-frequency document-term matrix over the cleaned, stemmed corpus.
dtm1 <- DocumentTermMatrix(corpus1)

# Remove Sparse Terms 
# sparse = 0.95 keeps only terms appearing in at least 5% of the reviews.
xdtm1 = removeSparseTerms(dtm1,sparse = 0.95)
 #xdtm

# Complete Stems
# Convert to a data frame and replace stem column names with their most
# prevalent completion from the raw-text dictionary; make.names() ensures
# syntactically valid, unique column names.
xdtm1 = as.data.frame(as.matrix(xdtm1))
colnames(xdtm1) = stemCompletion(x = colnames(xdtm1),
                                dictionary = dict_corpus1,
                                type='prevalent')
colnames(xdtm1) = make.names(colnames(xdtm1))
#head(sort(colSums(xdtm1),decreasing = T))

We also created another document term matrix using Term Frequency - Inverse Document Frequency Weighting.

# Document Term Matrix - tfidf
# Build a second DTM intended to use (unnormalized) tf-idf weighting.
# NOTE(review): the echoed warning below ("custom functions are ignored")
# indicates SimpleCorpus may discard the custom weighting control, in which
# case this matrix is plain term frequency; consider VCorpus() instead and
# verify the weights actually differ from dtm1. TODO confirm.
dtm_tfidf1 = DocumentTermMatrix(x=corpus1,
                               control = list(weighting=function(x) weightTfIdf(x,normalize=F)))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
# Same post-processing as the tf matrix: drop terms in <5% of reviews,
# then complete stems back to prevalent dictionary words.
xdtm_tfidf1 = removeSparseTerms(dtm_tfidf1,sparse = 0.95)
xdtm_tfidf1 = as.data.frame(as.matrix(xdtm_tfidf1))
colnames(xdtm_tfidf1) = stemCompletion(x = colnames(xdtm_tfidf1),
                                      dictionary = dict_corpus1,
                                      type='prevalent')
colnames(xdtm_tfidf1) = make.names(colnames(xdtm_tfidf1))
#head(sort(colSums(xdtm_tfidf1),decreasing = T))

Then we plotted a bar chart which contrasts the weights of term frequency and term frequency inverse document frequency weighting for the top 35 terms.

library(tidyr); library(dplyr); library(ggplot2); library(ggthemes);library(RColorBrewer)
# Contrast tf vs tf-idf weights for the top terms across all reviews.
# NOTE(review): top_n() with no ordering variable ranks by the LAST column
# (tfidf), as the echoed "Selecting by tfidf" message confirms; if the intent
# was the top 35 terms by raw term frequency, use top_n(35, tf).
data.frame(term = colnames(xdtm1),tf = colMeans(xdtm1), tfidf = colMeans(xdtm_tfidf1))%>%
  arrange(desc(tf))%>%
  top_n(35)%>%
  gather(key=weighting_method,value=weight,2:3)%>%
  ggplot(aes(x=term,y=weight,fill=weighting_method))+
  geom_col(position='dodge')+
  coord_flip()+
  theme_economist()+ 
  # Set2 has only 8 colors; requesting 9 merely triggered a warning and
  # returned the 8-color palette anyway. Ask for 8 directly — same colors,
  # no warning.
  scale_fill_manual(values = brewer.pal(8, "Set2")[c(7, 8)])
## Selecting by tfidf

After tokenizing the reviews’ word frequency as a whole, we could see that reviewers mentioned factors such as time, rides, and food very frequently, whether or not Term Frequency–Inverse Document Frequency weighting was used. Therefore, the topic word categories can be taken as time, theme rides, dining, and customer services, and the topic words as time, rides, staff, and food. This motivated us to further separate ratings and sentiments to inspect words in negative-rating reviews (ratings under 3).

We created two document term matrices using both Term frequency and Term Frequency - Inverse Document Frequency Weighting for the negative-rating reviews and plotted a bar chart for the top 30 terms to compare two weightings. Similar to the bar chart of all-rating reviews above, Terms “ride” and “park” are heavy weighted. Term Frequency assigns them a heavy weight because they are the most frequently occurring term, and they appear in most of the reviews as well, while Term Frequency - Inverse Document Frequency assigns them a much lower weight. Topic word “food” is in the collection as well. Nevertheless, term “show” is not included while “staff” appears. And terms related to time are included a lot, such as “wait,” “minute,” and “hour.”

# Keep only negative-rating reviews (1 or 2 stars).
# Fixed: the original `subset(Rating == c(1, 2))` recycled the comparison
# vector (the source of the "longer object length" warning), keeping a row
# only when its position happened to line up with the matching element —
# silently dropping roughly half of the 1- and 2-star reviews.  %in% tests
# membership row-by-row, which is what was intended.  (Echoed counts later
# in the knitted output were produced by the buggy filter.)
negativeDisney <- disneyland %>% filter(Rating %in% c(1, 2))
# Create a document-term matrix for the negative sentiment words
# Same tf / tf-idf pipeline as the full-corpus chunk above, applied only to
# the negative-rating reviews.

#Clean the Review_Text column:
corpus <- Corpus(VectorSource(negativeDisney$Review_Text))

#(a) transform text to lower case,
corpus = tm_map(corpus,FUN = content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, FUN = content_transformer(tolower)):
## transformation drops documents
#(b) remove punctuation,
corpus = tm_map(corpus,FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removePunctuation): transformation
## drops documents
#(c) remove English stopwords using the following dictionary tm::stopwords("english")
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeWords,
## c(stopwords("english"))): transformation drops documents
#(d) remove numbers
corpus = tm_map(corpus,FUN = removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeNumbers): transformation
## drops documents
#(e) remove whitespace
corpus = tm_map(corpus,FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stripWhitespace): transformation
## drops documents
#(f) stem words
corpus = tm_map(corpus,FUN = stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stemDocument): transformation drops
## documents
dtm <- DocumentTermMatrix(corpus)
 #inspect(dtm)

# Unstemmed-term dictionary from the raw negative reviews, for stemCompletion().
dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(negativeDisney$Review_Text))),
                     lowfreq = 0)
dict_corpus = Corpus(VectorSource(dict))

# Remove Sparse Terms 
# Keep terms present in at least 5% of the negative reviews.
xdtm = removeSparseTerms(dtm,sparse = 0.95)
 #xdtm

# Complete Stems
xdtm = as.data.frame(as.matrix(xdtm))
colnames(xdtm) = stemCompletion(x = colnames(xdtm),
                                dictionary = dict_corpus,
                                type='prevalent')
colnames(xdtm) = make.names(colnames(xdtm))
#head(sort(colSums(xdtm),decreasing = T))

# Document Term Matrix - tfidf
# NOTE(review): the echoed warning suggests the custom weighting may be
# ignored for SimpleCorpus — verify the tf-idf weights actually differ.
dtm_tfidf = DocumentTermMatrix(x=corpus,
                               control = list(weighting=function(x) weightTfIdf(x,normalize=F)))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
xdtm_tfidf = removeSparseTerms(dtm_tfidf,sparse = 0.95)
xdtm_tfidf = as.data.frame(as.matrix(xdtm_tfidf))
colnames(xdtm_tfidf) = stemCompletion(x = colnames(xdtm_tfidf),
                                      dictionary = dict_corpus,
                                      type='prevalent')
colnames(xdtm_tfidf) = make.names(colnames(xdtm_tfidf))
#head(sort(colSums(xdtm_tfidf),decreasing = T))
# Contrast tf vs tf-idf weights for the top terms in negative reviews.
# NOTE(review): top_n() with no ordering variable ranks by the LAST column
# (tfidf), as the echoed "Selecting by tfidf" message confirms.
data.frame(term = colnames(xdtm),tf = colMeans(xdtm), tfidf = colMeans(xdtm_tfidf))%>%
  arrange(desc(tf))%>%
  top_n(30)%>%
  gather(key=weighting_method,value=weight,2:3)%>%
  ggplot(aes(x=term,y=weight,fill=weighting_method))+
  geom_col(position='dodge')+
  coord_flip()+
  theme_economist()+ 
  # Set2 has only 8 colors; requesting 8 (not 9) avoids the palette warning
  # while yielding exactly the same two colors.
  scale_fill_manual(values = brewer.pal(8, "Set2")[c(7, 8)])
## Selecting by tfidf

After zooming into the negative-rating reviews, we would like to take a look at the neutral-rating and positive rating reviews as well. For neutral-rating reviews (rating=3), we plotted the bar chart again to contrast the weights of term frequency and term frequency inverse document frequency weighting for the top 30 terms. Terms “ride” and “park” are heavy weighted again. Time words and food remain as two of the focuses. Term “staff” is still in the top 30, but the term “show” appears this time.

# Keep only neutral-rating reviews (exactly 3 stars; scalar comparison, so no
# recycling issue here, unlike the negative/positive subsets).
neutralDisney<-disneyland %>% subset(Rating == 3)
# Create a document-term matrix for the negative sentiment words
# (comment above kept from copy-paste; this chunk processes NEUTRAL reviews)

#Clean the Review_Text column:
corpus <- Corpus(VectorSource(neutralDisney$Review_Text))

#(a) transform text to lower case,
corpus = tm_map(corpus,FUN = content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, FUN = content_transformer(tolower)):
## transformation drops documents
#(b) remove punctuation,
corpus = tm_map(corpus,FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removePunctuation): transformation
## drops documents
#(c) remove English stopwords using the following dictionary tm::stopwords("english")
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeWords,
## c(stopwords("english"))): transformation drops documents
#(d) remove numbers
corpus = tm_map(corpus,FUN = removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeNumbers): transformation
## drops documents
#(e) remove whitespace
corpus = tm_map(corpus,FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stripWhitespace): transformation
## drops documents
#(f) stem words
corpus = tm_map(corpus,FUN = stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stemDocument): transformation drops
## documents
dtm <- DocumentTermMatrix(corpus)
 #inspect(dtm)

# Unstemmed-term dictionary from the raw neutral reviews, for stemCompletion().
dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(neutralDisney$Review_Text))),
                     lowfreq = 0)
dict_corpus = Corpus(VectorSource(dict))

# Remove Sparse Terms 
# Keep terms present in at least 5% of the neutral reviews.
xdtm = removeSparseTerms(dtm,sparse = 0.95)
 #xdtm

# Complete Stems
xdtm = as.data.frame(as.matrix(xdtm))
colnames(xdtm) = stemCompletion(x = colnames(xdtm),
                                dictionary = dict_corpus,
                                type='prevalent')
colnames(xdtm) = make.names(colnames(xdtm))
#head(sort(colSums(xdtm),decreasing = T))

# Document Term Matrix - tfidf
# NOTE(review): the echoed warning suggests the custom weighting may be
# ignored for SimpleCorpus — verify the tf-idf weights actually differ.
dtm_tfidf = DocumentTermMatrix(x=corpus,
                               control = list(weighting=function(x) weightTfIdf(x,normalize=F)))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
xdtm_tfidf = removeSparseTerms(dtm_tfidf,sparse = 0.95)
xdtm_tfidf = as.data.frame(as.matrix(xdtm_tfidf))
colnames(xdtm_tfidf) = stemCompletion(x = colnames(xdtm_tfidf),
                                      dictionary = dict_corpus,
                                      type='prevalent')
colnames(xdtm_tfidf) = make.names(colnames(xdtm_tfidf))
#head(sort(colSums(xdtm_tfidf),decreasing = T))

# Contrast tf vs tf-idf weights for the top terms in neutral reviews.
# NOTE(review): top_n() with no ordering variable ranks by the LAST column
# (tfidf), as the echoed "Selecting by tfidf" message confirms.
data.frame(term = colnames(xdtm),tf = colMeans(xdtm), tfidf = colMeans(xdtm_tfidf))%>%
  arrange(desc(tf))%>%
  top_n(30)%>%
  gather(key=weighting_method,value=weight,2:3)%>%
  ggplot(aes(x=term,y=weight,fill=weighting_method))+
  geom_col(position='dodge')+
  coord_flip()+
  theme_economist()+ 
  # Set2 has only 8 colors; requesting 8 (not 9) avoids the palette warning
  # while yielding exactly the same two colors.
  scale_fill_manual(values = brewer.pal(8, "Set2")[c(7, 8)])
## Selecting by tfidf

We also plotted the term frequency versus term frequency–inverse document frequency top-30-term bar chart for the positive-rating reviews, as shown below. We could see that terms like “love” and “enjoy” appear in the top 30, while time words are less prominent than in the negative- and neutral-rating reviews.

# Keep only positive-rating reviews (4 or 5 stars).
# Fixed: the original `subset(Rating == c(4, 5))` recycled the comparison
# vector (the source of the "longer object length" warning), silently
# dropping roughly half of the 4- and 5-star reviews.  %in% tests membership
# row-by-row, which is what was intended.  (Echoed counts later in the
# knitted output were produced by the buggy filter.)
positiveDisney <- disneyland %>% filter(Rating %in% c(4, 5))
# Create a document-term matrix for the negative sentiment words
# (comment above kept from copy-paste; this chunk processes POSITIVE reviews)

#Clean the Review_Text column:
corpus <- Corpus(VectorSource(positiveDisney$Review_Text))

#(a) transform text to lower case,
corpus = tm_map(corpus,FUN = content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, FUN = content_transformer(tolower)):
## transformation drops documents
#(b) remove punctuation,
corpus = tm_map(corpus,FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removePunctuation): transformation
## drops documents
#(c) remove English stopwords using the following dictionary tm::stopwords("english")
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeWords,
## c(stopwords("english"))): transformation drops documents
#(d) remove numbers
corpus = tm_map(corpus,FUN = removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeNumbers): transformation
## drops documents
#(e) remove whitespace
corpus = tm_map(corpus,FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stripWhitespace): transformation
## drops documents
#(f) stem words
corpus = tm_map(corpus,FUN = stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stemDocument): transformation drops
## documents
dtm <- DocumentTermMatrix(corpus)
 #inspect(dtm)

# Unstemmed-term dictionary from the raw positive reviews, for stemCompletion().
dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(positiveDisney$Review_Text))),
                     lowfreq = 0)
dict_corpus = Corpus(VectorSource(dict))

# Remove Sparse Terms 
# Keep terms present in at least 5% of the positive reviews.
xdtm = removeSparseTerms(dtm,sparse = 0.95)
 #xdtm

# Complete Stems
xdtm = as.data.frame(as.matrix(xdtm))
colnames(xdtm) = stemCompletion(x = colnames(xdtm),
                                dictionary = dict_corpus,
                                type='prevalent')
colnames(xdtm) = make.names(colnames(xdtm))
#head(sort(colSums(xdtm),decreasing = T))

# Document Term Matrix - tfidf
# NOTE(review): the echoed warning suggests the custom weighting may be
# ignored for SimpleCorpus — verify the tf-idf weights actually differ.
dtm_tfidf = DocumentTermMatrix(x=corpus,
                               control = list(weighting=function(x) weightTfIdf(x,normalize=F)))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
xdtm_tfidf = removeSparseTerms(dtm_tfidf,sparse = 0.95)
xdtm_tfidf = as.data.frame(as.matrix(xdtm_tfidf))
colnames(xdtm_tfidf) = stemCompletion(x = colnames(xdtm_tfidf),
                                      dictionary = dict_corpus,
                                      type='prevalent')
colnames(xdtm_tfidf) = make.names(colnames(xdtm_tfidf))
#head(sort(colSums(xdtm_tfidf),decreasing = T))

# Contrast tf vs tf-idf weights for the top terms in positive reviews.
# NOTE(review): top_n() with no ordering variable ranks by the LAST column
# (tfidf), as the echoed "Selecting by tfidf" message confirms.
data.frame(term = colnames(xdtm),tf = colMeans(xdtm), tfidf = colMeans(xdtm_tfidf))%>%
  arrange(desc(tf))%>%
  top_n(30)%>%
  gather(key=weighting_method,value=weight,2:3)%>%
  ggplot(aes(x=term,y=weight,fill=weighting_method))+
  geom_col(position='dodge')+
  coord_flip()+
  theme_economist()+ 
  # Set2 has only 8 colors; requesting 8 (not 9) avoids the palette warning
  # while yielding exactly the same two colors.
  scale_fill_manual(values = brewer.pal(8, "Set2")[c(7, 8)])
## Selecting by tfidf

After plotting three term frequency versus term frequency inverse document frequency weighting top-30-term bar charts, we found that term “staff” only appears in top 30 of negative and neutral rating reviews. Term “food,” topic words related to “time,” and term “ride” are shown in all three types of rating reviews. Therefore, we would like to further investigate these terms separately.

Term “Food”

Based on previous analysis, term “food” is a focus over all rating reviews.

# Food-related keywords, matched below as whole words (\b anchors),
# case-insensitively.  Fixed: 'sandwiche' could never match as a whole word
# (neither "sandwich" nor "sandwiches" contains it with a boundary after the
# final 'e'); 'sandwich' can.  Note that plurals not listed explicitly
# (e.g. 'churros') are still not matched.
keywords <- c('food','foods', 'eating','eat','ate', 'restaurant', 'meal','drink','snack','churro','cake','turkey','corn','popcorn','pretzel','pizza','taco', 'burger','sandwich','milkshake','fries', 'chicken','beef','pork', 'beer','wine','breakfast','lunch','dinner')

# TRUE for each review whose text contains at least one keyword as a whole word.
contains_keyword <- grepl(paste0("\\b", keywords, "\\b", collapse = "|"), disneyland$Review_Text, ignore.case = TRUE)

# Reviews that mention food at least once.
allFoods <- disneyland[contains_keyword, ]
str(allFoods)
## (echoed counts below predate the keyword fix and may differ slightly)
## 'data.frame':    13639 obs. of  12 variables:
##  $ Review_ID        : int  670623270 670607911 670591897 670574142 670571027 670570869 670274554 670205135 670129921 670007081 ...
##  $ Rating           : int  4 4 3 3 2 5 5 3 3 3 ...
##  $ Year_Month       : chr  "Apr 2019" "Apr 2019" "Apr 2019" "Mar 2019" ...
##  $ Reviewer_Location: chr  "United Arab Emirates" "Australia" "Singapore" "Malaysia" ...
##  $ Review_Text      : chr  "Thanks God it wasn   t too hot or too humid when I was visiting the park   otherwise it would be a big issue (t"| __truncated__ "HK Disneyland is a great compact park. Unfortunately there is quite a bit of maintenance work going on at prese"| __truncated__ "Have been to Disney World, Disneyland Anaheim and Tokyo Disneyland but I feel that Disneyland Hong Kong is real"| __truncated__ "Think of it as an intro to Disney magic for the little ones. Almost all of the attractions can be completed in "| __truncated__ ...
##  $ Branch           : chr  "Disneyland_HongKong" "Disneyland_HongKong" "Disneyland_HongKong" "Disneyland_HongKong" ...
##  $ Year             : int  2019 2019 2019 2019 2019 2019 2018 2019 2019 2019 ...
##  $ Month            : int  4 4 4 3 4 3 9 1 4 4 ...
##  $ Rating_type      : chr  "positive" "positive" "neutral" "neutral" ...
##  $ continent        : chr  "Asia" "Oceania" "Asia" "Asia" ...
##  $ mention_ride     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_mention_ride  : num  1 1 1 1 1 1 1 1 1 1 ...

We found out the total number of positive and negative words in the reviews and plotted bar charts.

# Count positive vs negative Bing-lexicon words across all food-mentioning
# reviews.  The Bing lexicon contains a few duplicated entries, so the join is
# legitimately many-to-many; declaring it silences dplyr's fan-out warning
# without changing the result.
allFoods%>%
  group_by(Review_ID)%>%
  unnest_tokens(output = word, input = Review_Text)%>%
  inner_join(get_sentiments('bing'), relationship = "many-to-many")%>%
  group_by(sentiment)%>%
  count()
## # A tibble: 2 × 2
## # Groups:   sentiment [2]
##   sentiment      n
##   <chr>      <int>
## 1 negative   54280
## 2 positive  122195
# bar chart: proportion of positive/negative words within each rating level,
# for reviews that mention food ('bing' dictionary)
allFoods %>%
  select(Review_ID,Review_Text,Rating)%>%
  group_by(Review_ID, Rating)%>%
  unnest_tokens(output=word,input=Review_Text)%>%
  ungroup()%>%
  inner_join(get_sentiments('bing'), relationship = "many-to-many")%>%
  group_by(Rating,sentiment)%>%
  summarize(n = n())%>%
  mutate(proportion = n/sum(n))%>%
  ggplot(aes(x=Rating,y=proportion,fill=sentiment))+
  geom_col()+
  theme_economist()+
  coord_flip()
## `summarise()` has grouped output by 'Rating'. You can override using the
## `.groups` argument.

# bar chart for Rating Distribution for Reviews Mentioned Foods
ggplot(allFoods, aes(Rating))+
  geom_bar(stat = "count",position = 'dodge', fill="mistyrose3")+
  ggtitle("Rating Distribution for Reviews Mentioned Foods")

Considering that the reviews contain comments about other aspects, such as rides and shows, we further examined the sentences that mention food by selecting food-related sentences and storing them in a new dataframe foodSentences_ratings, which contains the combined food-related sentences (one reviewer may have two or more such sentences), Review_ID, and Rating.

# Split the Review_Text column into sentences 
allFoods_sentences <- allFoods %>%
  unnest_tokens(output=sentence, input=Review_Text, "sentences")

# Filter the sentences containing at least one food word 
foodSentences <- allFoods_sentences %>%
  filter(grepl(paste(keywords, collapse="|"), sentence))

dim(foodSentences)
## [1] 41367    12
#head(foodSentences)

# Collapse all food sentences belonging to the same reviewer into one string.
foodSentences_combined <- aggregate(
  sentence ~ Review_ID,
  data = foodSentences,
  FUN = paste, collapse = " "
)
    #foodSentences_combined

# Attach each reviewer's rating to the combined sentences.
foodSentences_ratings <- foodSentences_combined %>%
  left_join(select(allFoods, Review_ID, Rating), by = "Review_ID")

dim(foodSentences_ratings)
## [1] 13639     3
#head(foodSentences_ratings)

We used the “bing” dictionary and the ‘nrc’ lexicon to examine the sentiments expressed in the food sentences.

# Bar chart of overall sentence-level sentiment using the "bing" dictionary.
foodSentences_ratings %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = sentence) %>%
  # Some words have two bing entries; the many-to-many match is expected,
  # so declare it to silence the join warning (result unchanged).
  inner_join(get_sentiments('bing'), relationship = "many-to-many") %>%
  group_by(sentiment) %>%
  count() %>%
  ggplot(aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  theme_economist() +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; "none" hides
  # the legend with identical output.
  guides(fill = "none") +
  coord_flip()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 970771 of `x` matches multiple rows in `y`.
## ℹ Row 2698 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Bar chart of sentiment proportions by rating for the food sentences.
foodSentences_ratings %>%
  select(Review_ID, sentence, Rating) %>%
  group_by(Review_ID, Rating) %>%
  unnest_tokens(output = word, input = sentence) %>%
  ungroup() %>%
  # Many-to-many matches are expected (duplicate bing entries); declaring
  # the relationship silences the warning without changing the result.
  inner_join(get_sentiments('bing'), relationship = "many-to-many") %>%
  group_by(Rating, sentiment) %>%
  # Stay grouped by Rating so `proportion` is within each rating
  # (summarise()'s default, stated explicitly to silence the message).
  summarize(n = n(), .groups = "drop_last") %>%
  mutate(proportion = n / sum(n)) %>%
  ggplot(aes(x = Rating, y = proportion, fill = sentiment)) +
  geom_col() +
  theme_economist() +
  coord_flip()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 970771 of `x` matches multiple rows in `y`.
## ℹ Row 2698 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## `summarise()` has grouped output by 'Rating'. You can override using the
## `.groups` argument.

# plot a bar chart with ‘nrc’ lexicon 
library(lexicon)
# NRC emotion lexicon loaded from a local CSV; presumably has columns
# `word` and `sentiment` (the joins below rely on `word`) — confirm
# against nrc.csv.
nrc<-read.csv("nrc.csv")

# Bar chart of NRC emotion counts in the food sentences.
foodSentences_ratings %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = sentence) %>%
  # One word can map to several NRC emotions, so the many-to-many join is
  # expected; declaring it silences the warning (result unchanged).
  inner_join(nrc, relationship = "many-to-many") %>%
  group_by(sentiment) %>%
  count() %>%
  ggplot(aes(x = reorder(sentiment, X = n), y = n, fill = sentiment)) +
  geom_col() +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  coord_flip() +
  theme_wsj()
## Joining with `by = join_by(word)`
## Warning in inner_join(., nrc): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 14 of `x` matches multiple rows in `y`.
## ℹ Row 5150 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

From the plots above, we can see that overall most reviewers maintain a positive attitude towards foods. The number of positive sentiment words is about twice the number of negative sentiment words. The count of positive emotions based on the “nrc” lexicon is also relatively high. Therefore, we further zoomed into the low-rating sentences to investigate the emotions expressed about foods in the low ratings (Rating of 1 or 2).

# Keep the low-rating (1- or 2-star) food sentences.
# BUG FIX: `Rating == c(1, 2)` recycles the two-element vector against the
# column (hence the "longer object length" warning) and silently keeps only
# rows whose position happens to line up; `%in%` tests membership for every
# row, which is what was intended.
negativeFoods <- foodSentences_ratings %>% subset(Rating %in% c(1, 2))
## Warning in Rating == c(1, 2): longer object length is not a multiple of shorter
## object length
#head(negativeFoods)

# Table of NRC emotion counts in the low-rating food sentences.
negativeFoods %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = sentence) %>%
  # Many-to-many is expected: one word can carry several NRC emotions.
  inner_join(nrc, relationship = "many-to-many") %>%
  group_by(sentiment) %>%
  count() %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
## Warning in inner_join(., nrc): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10 of `x` matches multiple rows in `y`.
## ℹ Row 4201 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## # A tibble: 10 × 2
## # Groups:   sentiment [10]
##    sentiment        n
##    <chr>        <int>
##  1 positive      3452
##  2 trust         2073
##  3 negative      2029
##  4 joy           1849
##  5 anticipation  1763
##  6 sadness        922
##  7 anger          907
##  8 fear           863
##  9 surprise       756
## 10 disgust        744
# Bar chart of bing sentiment counts in the low-rating food sentences.
negativeFoods %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = sentence) %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(sentiment) %>%
  count() %>%
  ggplot(aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  theme_economist() +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; "none" hides
  # the legend with identical output.
  guides(fill = "none") +
  coord_flip()
## Joining with `by = join_by(word)`

# Bar chart of NRC emotion counts in the low-rating food sentences.
negativeFoods %>%
  group_by(Review_ID) %>%
  unnest_tokens(output = word, input = sentence) %>%
  # Many-to-many is expected (one word, several NRC emotions); declaring
  # it silences the join warning without changing the result.
  inner_join(nrc, relationship = "many-to-many") %>%
  group_by(sentiment) %>%
  count() %>%
  ggplot(aes(x = reorder(sentiment, X = n), y = n, fill = sentiment)) +
  geom_col() +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  coord_flip() +
  theme_wsj()
## Joining with `by = join_by(word)`
## Warning in inner_join(., nrc): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10 of `x` matches multiple rows in `y`.
## ℹ Row 4201 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

The plots above show that in the negative ratings (ratings under 3), most reviewers hold a negative attitude towards foods, which means that food-related issues may play a role in low ratings. Below we checked 20 reviews mentioning foods in low ratings.

# we checked 20 reviews, but only show 6 here.
# First 3 combined low-rating food sentences (one string per reviewer).
head(negativeFoods$sentence, 3)
## [1] "it was absolutely freezing which obviously is not disney's fault but it was snowing when we arrived which was quite magical but the snow settled on ice and previous snow which basically created a lethal surface with which to walk on! in fairness the staff were very pleasant at disney but i think they knew they were very wrong in not cleaning putting something down to make floor less slippery.it ruined my holiday but i think the children still enjoyed themselves...favourites were buzz lightyear ride, peter pan ride and live animated stitch.glad i looked on here before we went for tips.. take rolls batches cheese and ham from buffet breakfast to eat throughout the day.we actually took a suitcase of our own food drink which was a life saver.rainforest cafe was great to eat...everywhere is expensive but you get an experience with this too!"                                                                                                                                                                                                                                                 
## [2] "we spent the 20 dec 2010 in the disney park and by lunchtime attractions were starting to close or break down. we walked to mgm studios and easily got in an into the first building which was populated with shops and food outlets. the open air magic carpet ride was open but it was left to the riders to clear the deep snow from the seats. people headed for the indoor attractions to find that under cover marshalling areas were closed on the pretext of dangerous snow levels on roofs and people were being advised to return later. fast pass booths were closed presunably because of weather damage. shops and restaurants. these places became almost claustrphobically overpopulated. by lunchtime we had enjoyed only 4 hot chocolates each and one ride for with our grand children so we gave up and retried the disney park. a slight easing of the weather enabled two of the parades to operate but the main wish of our grandkids to see princesses was not achieved. in the end we booked an expensive character dinner just to enable them to meet mickey, minnie and some of the animal characters."
## [3] "the area for the buses, shuttles, taxis and private cars is three narrow lanes right outside of the train station. fortunately we overheard someone talking about our hotel and asked if they were waiting for the shuttle, which they were. funny how he didn't count or organize the people in any manner to avoid the angry, frustrated group that had to either rearrange, or get off the shuttle. this whole ordeal was repeated again every time we took the radisson blu shuttle. every restaurant we went in, with the exception of the hakuna matata, was beyond filthy. overflowing garbage cans, tables with gobs of food on, around and under them and barely a cast member in sight cleaning up."
# Last 3 combined low-rating food sentences (one string per reviewer).
tail(negativeFoods$sentence, 3)
## [1] "we went to disneyland paris during the easter break with family members who had come over from the states. not a great first impression. not great. we did manage to eat lunch  early  enough that we didn't have to wait a long time. however, the food that we got was not what we ordered. none of us had the wherewithal to go back and try to fix the order. so we just ate what we got. there were plenty of other foodcarts around the park that were empty. not enough staff at the park to operate these."
## [2] "people that do not have that much to spend will be held back, because people with money obviously can skip the lines...absolutely ridiculous.getting something to eat was horrible, again lines everywhere...just to eat someting right..."                                                                                                                                                                                                                                                                        
## [3] "walt disney would be horrified, not enough eating places for the amount of people or rest rooms."

There are many food-related issues, including over-pricing, insufficient and unclean eating places, too much unhealthy or poor-quality food, and long waiting times.

Time Words

Considering that words related to time were mentioned frequently by reviewers giving negative reviews, we would like to investigate time words in the reviews further. By using the spacyr package, which provides an interface to the spaCy natural language processing (NLP) library in Python, we were able to perform Named Entity Recognition (NER). NER can be helpful for text classification tasks, identifying and classifying named entities in text. Through the function spacy_parse, we could access time words through the time entity, which consists of “TIME_B” and “TIME_I.” The letters “B” and “I” at the end of each entity type indicate whether the token is the beginning (“B”) of a multi-word entity, such as “45,” or inside (“I”) it, such as “minutes.”

library(spacyr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
#spacy_upgrade()

# Initialize connection to Python spaCy and create automatic Named Entity Recognition features.
#spacy_initialize("en_core_web_sm")
#spacy_install("en_core_web_sm")
#spacy_initialize(model = "en_core_web_sm")

# Tokenize the negative reviews and tag named entities only; POS tags and
# lemmas are skipped to keep the parse fast. Requires a working spaCy
# Python environment (initialized above). NOTE(review): `negativeDisney`
# is built earlier in the document — presumably the low-rating reviews;
# confirm upstream.
ner_negativeDisney <- spacy_parse(negativeDisney$Review_Text, tag = FALSE, 
                                entity = TRUE,lemma = FALSE) 
## Found 'spacy_condaenv'. spacyr will use this environment
## successfully initialized (spaCy Version: 3.5.1, language model: en_core_web_sm)
## (python options: type = "condaenv", value = "spacy_condaenv")
#glimpse(ner_negativeDisney)


# Reshape to one row per document with one count column per entity type.
# Passing fun.aggregate = length explicitly (count of rows per cell) is what
# dcast defaults to here, so the result is identical but the "Aggregation
# function missing" message is silenced.
wide_ner_negativeDisney <- dcast(ner_negativeDisney, doc_id ~ entity,
                                 value.var = "entity", fun.aggregate = length)
## Aggregation function missing: defaulting to length
glimpse(wide_ner_negativeDisney)
## Rows: 1,642
## Columns: 36
## $ doc_id        <chr> "text1", "text10", "text100", "text1000", "text1001", "t…
## $ Var.2         <int> 168, 106, 89, 40, 32, 147, 254, 323, 1408, 85, 436, 300,…
## $ CARDINAL_B    <int> 1, 0, 0, 1, 0, 0, 2, 3, 19, 2, 13, 5, 0, 0, 9, 4, 2, 1, …
## $ CARDINAL_I    <int> 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 3, 1, 0, 0, 1, 3, 0, 0, 0,…
## $ DATE_B        <int> 2, 0, 1, 0, 0, 0, 8, 0, 8, 0, 3, 0, 0, 0, 3, 2, 0, 3, 2,…
## $ DATE_I        <int> 2, 0, 0, 0, 0, 0, 6, 0, 11, 0, 4, 0, 0, 0, 1, 2, 0, 4, 4…
## $ EVENT_B       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ EVENT_I       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FAC_B         <int> 2, 1, 0, 0, 0, 0, 0, 0, 3, 1, 1, 1, 0, 0, 3, 1, 1, 1, 0,…
## $ FAC_I         <int> 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 0, 0, 2, 0, 1, 0, 0,…
## $ GPE_B         <int> 0, 0, 1, 1, 1, 0, 1, 4, 3, 0, 1, 4, 0, 0, 3, 1, 1, 6, 2,…
## $ GPE_I         <int> 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ LANGUAGE_B    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ LAW_B         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ LAW_I         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ LOC_B         <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 2, 3, 0, 0, 0,…
## $ LOC_I         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0,…
## $ MONEY_B       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ MONEY_I       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ NORP_B        <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ NORP_I        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ORDINAL_B     <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 6, 1,…
## $ ORG_B         <int> 1, 1, 1, 0, 0, 3, 0, 4, 4, 1, 2, 1, 0, 2, 3, 5, 5, 4, 0,…
## $ ORG_I         <int> 0, 0, 1, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 1, 2, 0, 2, 4, 0,…
## $ PERCENT_B     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2,…
## $ PERCENT_I     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2,…
## $ PERSON_B      <int> 2, 0, 0, 0, 0, 0, 0, 1, 5, 1, 1, 0, 0, 0, 1, 2, 0, 0, 0,…
## $ PERSON_I      <int> 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ PRODUCT_B     <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PRODUCT_I     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ QUANTITY_B    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ QUANTITY_I    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ TIME_B        <int> 0, 1, 1, 0, 0, 0, 2, 4, 10, 2, 2, 3, 0, 0, 4, 4, 1, 2, 0…
## $ TIME_I        <int> 0, 0, 1, 0, 0, 0, 6, 5, 11, 2, 2, 5, 0, 0, 8, 8, 1, 1, 0…
## $ WORK_OF_ART_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ WORK_OF_ART_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Extract time words. 
# BUG FIX: `entity == c("TIME_I", "TIME_B")` recycles the two-element vector
# against the column (hence the "longer object length" warning) and silently
# drops tokens whose positions do not line up; `%in%` keeps every token tagged
# with either TIME entity part.
time_entities <- subset(ner_negativeDisney, entity %in% c("TIME_I", "TIME_B")) %>%
  group_by(token) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  # slice_max keeps ties like top_n(15) did, but is the non-superseded verb
  # and names the ranking column explicitly.
  slice_max(order_by = count, n = 15)
## Warning in entity == c("TIME_I", "TIME_B"): longer object length is not a
## multiple of shorter object length
## Selecting by count
time_entities
## # A tibble: 15 × 2
##    token   count
##    <chr>   <int>
##  1 minutes   290
##  2 hours     229
##  3 hour      183
##  4 an         97
##  5 2          87
##  6 pm         78
##  7 minute     77
##  8 10         56
##  9 45         51
## 10 1          48
## 11 30         47
## 12 a          45
## 13 am         42
## 14 about      41
## 15 3          40
# Bar chart of the most frequent time words in negative reviews,
# ordered by descending count. geom_col() is the idiomatic equivalent of
# geom_bar(stat = "identity") — identical output.
#install.packages("pals")
library("pals")
time_entities %>%
  ggplot(aes(x = reorder(token, -count), y = count)) +
  geom_col(aes(fill = as.factor(token))) +
  labs(title = "Time Words in Negative Disneyland Reviews", 
       x = "Time words", y = "Count") +
  theme_economist() + 
  scale_fill_manual(values = as.vector(brewer.set2(20)))

The time plot above shows that hours and minutes are mentioned frequently in negative reviews, so we speculate that many visitors’ waiting times are quite long. However, these are only unit words; we need to construct predictive models to see how time issues relate to low ratings.